From a34975d5a1fde659b44ce3d378b93bfa24916f1f Mon Sep 17 00:00:00 2001
From: "Meterelliyoz, Mesut" <mesut.meterelliyoz@intel.com>
Date: Mon, 18 May 2020 22:09:11 -0700
Subject: [PATCH 1/5] Enabling Unified Shared Memory (USM) interfaces

---
 cmake/FindMKL.cmake                           |    3 +-
 docs/domains/blas/asum.rst                    |  112 +-
 docs/domains/blas/axpy.rst                    |  134 +-
 docs/domains/blas/axpy_batch.rst              |  160 +
 .../blas/blas-level-1-routines.inc.rst        |   92 -
 docs/domains/blas/blas-level-1-routines.rst   |   76 +
 .../blas/blas-level-2-routines.inc.rst        |  130 -
 docs/domains/blas/blas-level-2-routines.rst   |  104 +
 .../blas/blas-level-3-routines.inc.rst        |   90 -
 docs/domains/blas/blas-level-3-routines.rst   |   56 +
 docs/domains/blas/blas-like-extensions.rst    |   47 +
 docs/domains/blas/blas.rst                    |   13 +-
 docs/domains/blas/copy.rst                    |  123 +-
 docs/domains/blas/dot.rst                     |  131 +-
 docs/domains/blas/dotc.rst                    |  120 +-
 docs/domains/blas/dotu.rst                    |  124 +-
 docs/domains/blas/gbmv.rst                    |  166 +-
 docs/domains/blas/gemm.rst                    |  195 +-
 docs/domains/blas/gemm_batch.rst              |  514 ++-
 docs/domains/blas/gemm_ext.rst                |  348 +-
 docs/domains/blas/gemmt.rst                   |  222 +-
 docs/domains/blas/gemv.rst                    |  166 +-
 docs/domains/blas/ger.rst                     |  144 +-
 docs/domains/blas/gerc.rst                    |  144 +-
 docs/domains/blas/geru.rst                    |  145 +-
 docs/domains/blas/hbmv.rst                    |  161 +-
 docs/domains/blas/hemm.rst                    |  209 +-
 docs/domains/blas/hemv.rst                    |  155 +-
 docs/domains/blas/her.rst                     |  151 +-
 docs/domains/blas/her2.rst                    |  163 +-
 docs/domains/blas/her2k.rst                   |  183 +-
 docs/domains/blas/herk.rst                    |  172 +-
 docs/domains/blas/hpmv.rst                    |  155 +-
 docs/domains/blas/hpr.rst                     |  153 +-
 docs/domains/blas/hpr2.rst                    |  166 +-
 docs/domains/blas/iamax.rst                   |  113 +-
 docs/domains/blas/iamin.rst                   |  107 +-
 docs/domains/blas/nrm2.rst                    |  116 +-
 docs/domains/blas/rot.rst                     |  138 +-
 docs/domains/blas/rotg.rst                    |  118 +-
 docs/domains/blas/rotm.rst                    |  181 +-
 docs/domains/blas/rotmg.rst                   |  168 +-
 docs/domains/blas/sbmv.rst                    |  159 +-
 docs/domains/blas/scal.rst                    |  113 +-
 docs/domains/blas/sdsdot.rst                  |  129 +-
 docs/domains/blas/spmv.rst                    |  149 +-
 docs/domains/blas/spr.rst                     |  147 +-
 docs/domains/blas/spr2.rst                    |  153 +-
 docs/domains/blas/swap.rst                    |  135 +-
 docs/domains/blas/symm.rst                    |  200 +-
 docs/domains/blas/symv.rst                    |  151 +-
 docs/domains/blas/syr.rst                     |  147 +-
 docs/domains/blas/syr2.rst                    |  159 +-
 docs/domains/blas/syr2k.rst                   |  187 +-
 docs/domains/blas/syrk.rst                    |  164 +-
 docs/domains/blas/tbmv.rst                    |  157 +-
 docs/domains/blas/tbsv.rst                    |  156 +-
 docs/domains/blas/tpmv.rst                    |  147 +-
 docs/domains/blas/tpsv.rst                    |  147 +-
 docs/domains/blas/trmm.rst                    |  196 +-
 docs/domains/blas/trmv.rst                    |  151 +-
 docs/domains/blas/trsm.rst                    |  193 +-
 docs/domains/blas/trsm_batch.rst              |  335 +-
 docs/domains/blas/trsv.rst                    |  151 +-
 include/onemkl/blas/blas.hpp                  | 2071 +++++++++-
 .../onemkl/blas/detail/blas_ct_templates.hpp  | 2089 ++++++++++
 include/onemkl/blas/detail/blas_loader.hpp    |  861 ++++-
 include/onemkl/blas/detail/cublas/blas_ct.hpp | 3127 +++++++++------
 .../blas/detail/cublas/onemkl_blas_cublas.hpp |  847 ++++-
 include/onemkl/blas/detail/mklcpu/blas_ct.hpp | 3127 +++++++++------
 .../blas/detail/mklcpu/onemkl_blas_mklcpu.hpp |  960 ++++-
 include/onemkl/blas/detail/mklgpu/blas_ct.hpp | 3125 +++++++++------
 .../blas/detail/mklgpu/onemkl_blas_mklgpu.hpp |  984 ++++-
 include/onemkl/blas/predicates.hpp            | 3355 +++++++++++++++--
 src/blas/backends/cublas/cublas_batch.cpp     |  213 +-
 .../backends/cublas/cublas_extensions.cpp     |   61 +-
 src/blas/backends/cublas/cublas_helper.hpp    |    2 +-
 src/blas/backends/cublas/cublas_level1.cpp    |  268 ++
 src/blas/backends/cublas/cublas_level2.cpp    |  505 +++
 src/blas/backends/cublas/cublas_level3.cpp    |  219 ++
 .../cublas/mkl_blas_cublas_wrappers.cpp       |  174 +-
 src/blas/backends/mklcpu/cpu_batch.cpp        | 1244 +++---
 src/blas/backends/mklcpu/cpu_extensions.cpp   |  102 +
 src/blas/backends/mklcpu/cpu_level1.cpp       |  820 ++++
 src/blas/backends/mklcpu/cpu_level2.cpp       | 1299 +++++++
 src/blas/backends/mklcpu/cpu_level3.cpp       |  692 ++++
 .../backends/mklcpu/mkl_blas_cpu_wrappers.cpp |  174 +-
 src/blas/backends/mklgpu/CMakeLists.txt       |    2 +-
 .../backends/mklgpu/mkl_blas_gpu_wrappers.cpp |  174 +-
 .../backends/mklgpu/mkl_blas_sycl_buffer.cpp  |  102 -
 .../backends/mklgpu/mkl_blas_sycl_usm.cpp     | 1332 +++++++
 .../mklgpu/mkl_internal_blas_gpu_wrappers.cpp | 2590 +++++++++----
 .../mklgpu/mkl_internal_blas_gpu_wrappers.hpp | 1099 +++++-
 .../mklgpu/mkl_internal_blas_sycl_gpu.hpp     |  976 ++++-
 src/blas/blas_loader.cpp                      | 1419 ++++++-
 src/blas/function_table.hpp                   |  910 ++++-
 src/include/exceptions_helper.hpp             |   34 +
 tests/unit_tests/CMakeLists.txt               |    2 +
 tests/unit_tests/blas/batch/CMakeLists.txt    |    2 +-
 .../unit_tests/blas/batch/axpy_batch_usm.cpp  |  239 ++
 tests/unit_tests/blas/batch/gemm_batch.cpp    |  308 --
 .../blas/batch/gemm_batch_stride.cpp          |   23 +-
 .../blas/batch/gemm_batch_stride_usm.cpp      |  228 ++
 .../unit_tests/blas/batch/gemm_batch_usm.cpp  |  370 ++
 tests/unit_tests/blas/batch/trsm_batch.cpp    |  297 --
 .../blas/batch/trsm_batch_stride.cpp          |   23 +-
 .../unit_tests/blas/extensions/CMakeLists.txt |    2 +-
 tests/unit_tests/blas/extensions/gemm_ext.cpp |  100 +-
 .../blas/extensions/gemm_ext_off.cpp          |   54 +-
 tests/unit_tests/blas/extensions/gemmt.cpp    |  269 +-
 .../unit_tests/blas/extensions/gemmt_usm.cpp  |  289 ++
 tests/unit_tests/blas/include/test_common.hpp |   37 +
 tests/unit_tests/blas/level1/CMakeLists.txt   |    2 +-
 tests/unit_tests/blas/level1/asum.cpp         |   39 +-
 tests/unit_tests/blas/level1/asum_usm.cpp     |  144 +
 tests/unit_tests/blas/level1/axpy.cpp         |   36 +-
 tests/unit_tests/blas/level1/axpy_usm.cpp     |  145 +
 tests/unit_tests/blas/level1/copy.cpp         |   36 +-
 tests/unit_tests/blas/level1/copy_usm.cpp     |  139 +
 tests/unit_tests/blas/level1/dot.cpp          |   30 +-
 tests/unit_tests/blas/level1/dot_usm.cpp      |  136 +
 tests/unit_tests/blas/level1/dotc.cpp         |   24 +-
 tests/unit_tests/blas/level1/dotc_usm.cpp     |  134 +
 tests/unit_tests/blas/level1/dotu.cpp         |   24 +-
 tests/unit_tests/blas/level1/dotu_usm.cpp     |  133 +
 tests/unit_tests/blas/level1/iamax.cpp        |   36 +-
 tests/unit_tests/blas/level1/iamax_usm.cpp    |  139 +
 tests/unit_tests/blas/level1/iamin.cpp        |   36 +-
 tests/unit_tests/blas/level1/iamin_usm.cpp    |  139 +
 tests/unit_tests/blas/level1/nrm2.cpp         |   36 +-
 tests/unit_tests/blas/level1/nrm2_usm.cpp     |  140 +
 tests/unit_tests/blas/level1/rot.cpp          |   36 +-
 tests/unit_tests/blas/level1/rot_usm.cpp      |  150 +
 tests/unit_tests/blas/level1/rotg.cpp         |   57 +-
 tests/unit_tests/blas/level1/rotg_usm.cpp     |  158 +
 tests/unit_tests/blas/level1/rotm.cpp         |   60 +-
 tests/unit_tests/blas/level1/rotm_usm.cpp     |  161 +
 tests/unit_tests/blas/level1/rotmg.cpp        |   16 +-
 tests/unit_tests/blas/level1/rotmg_usm.cpp    |  141 +
 tests/unit_tests/blas/level1/scal.cpp         |   39 +-
 tests/unit_tests/blas/level1/scal_usm.cpp     |  153 +
 tests/unit_tests/blas/level1/sdsdot.cpp       |   18 +-
 tests/unit_tests/blas/level1/sdsdot_usm.cpp   |  126 +
 tests/unit_tests/blas/level1/swap.cpp         |   36 +-
 tests/unit_tests/blas/level1/swap_usm.cpp     |  141 +
 tests/unit_tests/blas/level2/CMakeLists.txt   |    2 +-
 tests/unit_tests/blas/level2/gbmv.cpp         |  114 +-
 tests/unit_tests/blas/level2/gbmv_usm.cpp     |  205 +
 tests/unit_tests/blas/level2/gemv.cpp         |  115 +-
 tests/unit_tests/blas/level2/gemv_usm.cpp     |  204 +
 tests/unit_tests/blas/level2/ger.cpp          |   24 +-
 tests/unit_tests/blas/level2/ger_usm.cpp      |  136 +
 tests/unit_tests/blas/level2/gerc.cpp         |   24 +-
 tests/unit_tests/blas/level2/gerc_usm.cpp     |  136 +
 tests/unit_tests/blas/level2/geru.cpp         |   24 +-
 tests/unit_tests/blas/level2/geru_usm.cpp     |  136 +
 tests/unit_tests/blas/level2/hbmv.cpp         |   42 +-
 tests/unit_tests/blas/level2/hbmv_usm.cpp     |  159 +
 tests/unit_tests/blas/level2/hemv.cpp         |   38 +-
 tests/unit_tests/blas/level2/hemv_usm.cpp     |  158 +
 tests/unit_tests/blas/level2/her.cpp          |   36 +-
 tests/unit_tests/blas/level2/her2.cpp         |   50 +-
 tests/unit_tests/blas/level2/her2_usm.cpp     |  155 +
 tests/unit_tests/blas/level2/her_usm.cpp      |  153 +
 tests/unit_tests/blas/level2/hpmv.cpp         |   46 +-
 tests/unit_tests/blas/level2/hpmv_usm.cpp     |  156 +
 tests/unit_tests/blas/level2/hpr.cpp          |   42 +-
 tests/unit_tests/blas/level2/hpr2.cpp         |   40 +-
 tests/unit_tests/blas/level2/hpr2_usm.cpp     |  145 +
 tests/unit_tests/blas/level2/hpr_usm.cpp      |  153 +
 tests/unit_tests/blas/level2/sbmv.cpp         |   40 +-
 tests/unit_tests/blas/level2/sbmv_usm.cpp     |  148 +
 tests/unit_tests/blas/level2/spmv.cpp         |   38 +-
 tests/unit_tests/blas/level2/spmv_usm.cpp     |  144 +
 tests/unit_tests/blas/level2/spr.cpp          |   36 +-
 tests/unit_tests/blas/level2/spr2.cpp         |   36 +-
 tests/unit_tests/blas/level2/spr2_usm.cpp     |  141 +
 tests/unit_tests/blas/level2/spr_usm.cpp      |  139 +
 tests/unit_tests/blas/level2/symv.cpp         |   38 +-
 tests/unit_tests/blas/level2/symv_usm.cpp     |  145 +
 tests/unit_tests/blas/level2/syr.cpp          |   36 +-
 tests/unit_tests/blas/level2/syr2.cpp         |   38 +-
 tests/unit_tests/blas/level2/syr2_usm.cpp     |  142 +
 tests/unit_tests/blas/level2/syr_usm.cpp      |  140 +
 tests/unit_tests/blas/level2/tbmv.cpp         |  218 +-
 tests/unit_tests/blas/level2/tbmv_usm.cpp     |  237 ++
 tests/unit_tests/blas/level2/tbsv.cpp         |  218 +-
 tests/unit_tests/blas/level2/tbsv_usm.cpp     |  237 ++
 tests/unit_tests/blas/level2/tpmv.cpp         |  186 +-
 tests/unit_tests/blas/level2/tpmv_usm.cpp     |  220 ++
 tests/unit_tests/blas/level2/tpsv.cpp         |  186 +-
 tests/unit_tests/blas/level2/tpsv_usm.cpp     |  220 ++
 tests/unit_tests/blas/level2/trmv.cpp         |  208 +-
 tests/unit_tests/blas/level2/trmv_usm.cpp     |  232 ++
 tests/unit_tests/blas/level2/trsv.cpp         |  208 +-
 tests/unit_tests/blas/level2/trsv_usm.cpp     |  232 ++
 tests/unit_tests/blas/level3/CMakeLists.txt   |    2 +-
 tests/unit_tests/blas/level3/gemm.cpp         |  158 +-
 tests/unit_tests/blas/level3/gemm_usm.cpp     |  220 ++
 tests/unit_tests/blas/level3/hemm.cpp         |   46 +-
 tests/unit_tests/blas/level3/hemm_usm.cpp     |  154 +
 tests/unit_tests/blas/level3/her2k.cpp        |   62 +-
 tests/unit_tests/blas/level3/her2k_usm.cpp    |  160 +
 tests/unit_tests/blas/level3/herk.cpp         |   62 +-
 tests/unit_tests/blas/level3/herk_usm.cpp     |  158 +
 tests/unit_tests/blas/level3/symm.cpp         |   78 +-
 tests/unit_tests/blas/level3/symm_usm.cpp     |  178 +
 tests/unit_tests/blas/level3/syr2k.cpp        |   92 +-
 tests/unit_tests/blas/level3/syr2k_usm.cpp    |  183 +
 tests/unit_tests/blas/level3/syrk.cpp         |   82 +-
 tests/unit_tests/blas/level3/syrk_usm.cpp     |  178 +
 tests/unit_tests/blas/level3/trmm.cpp         |  304 +-
 tests/unit_tests/blas/level3/trmm_usm.cpp     |  287 ++
 tests/unit_tests/blas/level3/trsm.cpp         |  492 +--
 tests/unit_tests/blas/level3/trsm_usm.cpp     |  383 ++
 tests/unit_tests/include/test_helper.hpp      |   40 +
 216 files changed, 48420 insertions(+), 11089 deletions(-)
 create mode 100644 docs/domains/blas/axpy_batch.rst
 delete mode 100644 docs/domains/blas/blas-level-1-routines.inc.rst
 create mode 100644 docs/domains/blas/blas-level-1-routines.rst
 delete mode 100644 docs/domains/blas/blas-level-2-routines.inc.rst
 create mode 100644 docs/domains/blas/blas-level-2-routines.rst
 delete mode 100644 docs/domains/blas/blas-level-3-routines.inc.rst
 create mode 100644 docs/domains/blas/blas-level-3-routines.rst
 create mode 100644 docs/domains/blas/blas-like-extensions.rst
 create mode 100644 include/onemkl/blas/detail/blas_ct_templates.hpp
 create mode 100644 src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp
 create mode 100644 src/include/exceptions_helper.hpp
 create mode 100644 tests/unit_tests/blas/batch/axpy_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemm_batch.cpp
 create mode 100644 tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
 create mode 100644 tests/unit_tests/blas/batch/gemm_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/trsm_batch.cpp
 create mode 100644 tests/unit_tests/blas/extensions/gemmt_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/asum_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/axpy_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/copy_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/dot_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/dotc_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/dotu_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/iamax_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/iamin_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/nrm2_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/rot_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/rotg_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/rotm_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/rotmg_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/scal_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/sdsdot_usm.cpp
 create mode 100644 tests/unit_tests/blas/level1/swap_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/gbmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/gemv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/ger_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/gerc_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/geru_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/hbmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/hemv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/her2_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/her_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/hpmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/hpr2_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/hpr_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/sbmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/spmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/spr2_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/spr_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/symv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/syr2_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/syr_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/tbmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/tbsv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/tpmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/tpsv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/trmv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level2/trsv_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/gemm_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/hemm_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/her2k_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/herk_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/symm_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/syr2k_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/syrk_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/trmm_usm.cpp
 create mode 100644 tests/unit_tests/blas/level3/trsm_usm.cpp

diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
index c5656ab91..9a210fba7 100644
--- a/cmake/FindMKL.cmake
+++ b/cmake/FindMKL.cmake
@@ -66,6 +66,7 @@ if(UNIX)
   list(APPEND MKL_LINK_PREFIX "-L${MKL_LIB_DIR}")
   set(LIB_PREFIX "-l")
   set(OPENCL_LIBNAME "OpenCL")
+  set(SYCL_LINK_FLAGS "-fsycl")
 else()
   if(${BUILD_SHARED_LIBS})
     set(MKL_COPT ${MKL_COPT} "-Donemkl_EXPORTS")
@@ -84,7 +85,7 @@ if (ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND)
     list(APPEND MKL_LINK_C ${TBB_LINK})
   endif()
   if(ENABLE_MKLGPU_BACKEND)
-    set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME})
+    set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ${SYCL_LINK_FLAGS})
   endif()
 endif()
 
diff --git a/docs/domains/blas/asum.rst b/docs/domains/blas/asum.rst
index 8e7f53092..4a6092971 100644
--- a/docs/domains/blas/asum.rst
+++ b/docs/domains/blas/asum.rst
@@ -1,4 +1,4 @@
-.. _asum:
+.. _onemkl_blas_asum:
 
 asum
 ====
@@ -10,16 +10,6 @@ asum
    Computes the sum of magnitudes of the vector elements.
 
 
-   .. container:: section
-      :name: GUID-C135E117-8018-473E-BE83-8833C95BB3B5
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void asum(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T_res,1> &result)
 
       ``asum`` supports the following precisions.
 
@@ -42,11 +32,9 @@ asum
 
 
 .. container:: section
-   :name: GUID-6AFCECB5-6614-46AC-B921-AB5DED0D22B2
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -61,16 +49,27 @@ asum
    where ``x`` is a vector with ``n`` elements.
 
 
+asum (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::asum(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T_res,1> &result)
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -86,15 +85,13 @@ asum
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
 .. container:: section
-   :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -103,19 +100,84 @@ asum
       the real and imaginary parts of all elements of the vector).
 
 
-.. container:: familylinks
+asum (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::asum(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
 
+      queue
+         The queue where the routine should be executed.
 
-.. |image0| image:: ../equations/GUID-684BB993-83CA-4605-BD49-E493806C1ee1.png
+
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to input vector ``x``. The array holding the vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         Pointer to the location where the scalar result is stored
+         (the sum of magnitudes of the real and imaginary parts of all
+         elements of the vector).
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-4F76F5A1-251F-4AC0-A2E0-A3B4B6F39ee1.png
    :class: img-middle
 
diff --git a/docs/domains/blas/axpy.rst b/docs/domains/blas/axpy.rst
index 50f2ed986..b88309698 100644
--- a/docs/domains/blas/axpy.rst
+++ b/docs/domains/blas/axpy.rst
@@ -1,4 +1,4 @@
-.. _axpy:
+.. _onemkl_blas_axpy:
 
 axpy
 ====
@@ -10,16 +10,6 @@ axpy
    Computes a vector-scalar product and adds the result to a vector.
 
 
-   .. container:: section
-      :name: GUID-17ADB23B-C9B0-44B4-89F9-B7199DA9E872
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void axpy(queue &exec_queue, std::int64_t n, T      alpha, buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y,      std::int64_t incy)
 
       ``axpy`` supports the following precisions.
 
@@ -37,11 +27,9 @@ axpy
 
 
 .. container:: section
-   :name: GUID-4BC6BF9A-BAB9-4078-A6B5-9C7ECB9D4821
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -64,21 +52,32 @@ axpy
    ``alpha`` is a scalar.
 
 
+axpy (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::axpy(sycl::queue &queue, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    n
-      Number of elements in vector x.
+      Number of elements in vector ``x``.
 
 
    alpha
@@ -86,50 +85,127 @@ axpy
 
 
    x
-      Buffer holding input vector x. The buffer must be of size at least
+      Buffer holding input vector ``x``. The buffer must be of size at least
       ``(1 + (n – 1)*abs(incx))``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
-      Buffer holding input vector y. The buffer must be of size at least
+      Buffer holding input vector ``y``. The buffer must be of size at least
       ``(1 + (n – 1)*abs(incy))``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
 .. container:: section
-   :name: GUID-A0926D96-B673-48A4-986A-033719589288
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    y
-      Buffer holding the updated vector y.
+      Buffer holding the updated vector ``y``.
 
 
+axpy (USM Version)
+------------------
 
-.. container:: familylinks
+.. container::
 
+   .. container:: section
 
-   .. container:: parentlink
 
+      .. rubric:: Syntax
+         :class: sectiontitle
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
 
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::axpy(sycl::queue &queue, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
 
-.. container::
 
+      alpha
+         Specifies the scalar alpha.
+
+
+      x
+         Pointer to the input vector ``x``. The array holding the vector
+         ``x`` must be of size at least ``(1 + (n – 1)*abs(incx))``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to the input vector ``y``. The array holding the vector
+         ``y`` must be of size at least ``(1 + (n – 1)*abs(incy))``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/axpy_batch.rst b/docs/domains/blas/axpy_batch.rst
new file mode 100644
index 000000000..4dd9fb86e
--- /dev/null
+++ b/docs/domains/blas/axpy_batch.rst
@@ -0,0 +1,160 @@
+.. _onemkl_blas_axpy_batch:
+
+axpy_batch
+==========
+
+.. container::
+
+
+   The ``axpy_batch`` routines are batched versions of `axpy <axpy.html>`__, performing
+   multiple ``axpy`` operations in a single call. Each ``axpy`` 
+   operation adds a scalar-vector product to a vector.
+   
+
+      ``axpy_batch`` supports the following precisions.
+
+
+      .. list-table:: 
+         :header-rows: 1
+
+         * -  T 
+         * -  ``float`` 
+         * -  ``double`` 
+         * -  ``std::complex<float>`` 
+         * -  ``std::complex<double>`` 
+
+
+
+axpy_batch (USM Version)
+------------------------
+
+.. container:: section
+
+
+   .. rubric:: Description
+      :class: sectiontitle
+
+
+   The USM version of ``axpy_batch`` supports the group API.
+
+   The group API operation is defined as
+  
+   ::
+      
+      idx = 0
+      for i = 0 ... group_count - 1
+          for j = 0 ... group_size - 1
+              X and Y are vectors in x[idx] and y[idx]
+              Y := alpha[i] * X + Y
+              idx := idx + 1
+          end for
+      end for
+
+
+   where:
+
+   ``alpha`` is a scalar
+
+   ``X`` and ``Y`` are vectors.
+
+
+   For the group API, the ``x`` and ``y`` arrays contain pointers to all the input vectors.
+   The total number of vectors in each of ``x`` and ``y`` is given by:
+
+      total_batch_count = sum of all of the group_size entries
+
+
+   **Group API**
+
+.. container:: section
+
+
+   .. rubric:: Syntax
+      :class: sectiontitle
+
+
+   .. container:: dlsyntaxpara
+
+
+      .. cpp:function::  sycl::event onemkl::blas::axpy_batch(sycl::queue &queue, std::int64_t *n, T *alpha, const T **x, std::int64_t *incx, T **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, const sycl::vector_class<sycl::event> &dependencies = {})
+
+    
+.. container:: section
+
+
+   .. rubric:: Input Parameters
+      :class: sectiontitle
+
+   queue
+      The queue where the routine should be executed.
+
+   n
+      Array of ``group_count`` integers. ``n[i]`` specifies the number of elements in vectors ``X`` and ``Y`` for every vector in group ``i``.
+
+
+   alpha
+       Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for vector ``X`` in group ``i``.
+
+
+   x
+      Array of pointers to input vectors ``X`` with size ``total_batch_count``.
+      The size of array allocated for the ``X`` vector of the group ``i`` must be at least ``(1 + (n[i] - 1)*abs(incx[i]))``.
+      See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details.
+
+   incx
+      Array of ``group_count`` integers. ``incx[i]`` specifies the stride of vector ``X`` in group ``i``.
+ 
+   y
+      Array of pointers to input/output vectors ``Y`` with size ``total_batch_count``.
+      The size of array allocated for the ``Y`` vector of the group ``i`` must be at least ``(1 + (n[i] - 1)*abs(incy[i]))``.
+      See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details.
+
+   incy
+      Array of ``group_count`` integers. ``incy[i]`` specifies the stride of vector ``Y`` in group ``i``.
+
+
+   group_count
+      Number of groups. Must be at least 0.
+
+
+   group_size
+      Array of ``group_count`` integers. ``group_size[i]`` specifies the number of ``axpy`` operations in group ``i``. 
+      Each element in ``group_size`` must be at least 0.
+
+   dependencies
+      List of events to wait for before starting computation, if any.
+      If omitted, defaults to no dependencies.
+
+
+.. container:: section
+
+
+   .. rubric:: Output Parameters
+      :class: sectiontitle
+
+
+   y
+      Array of pointers holding the ``Y`` vectors, overwritten by ``total_batch_count`` ``axpy`` operations of the form 
+      ``alpha*X + Y``.
+
+
+.. container:: section
+
+
+   .. rubric:: Return Values
+      :class: sectiontitle
+
+
+   Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-like-extensions`
+      
+
+
diff --git a/docs/domains/blas/blas-level-1-routines.inc.rst b/docs/domains/blas/blas-level-1-routines.inc.rst
deleted file mode 100644
index 7798d13a4..000000000
--- a/docs/domains/blas/blas-level-1-routines.inc.rst
+++ /dev/null
@@ -1,92 +0,0 @@
-.. _blas-level-1-routines:
-
-BLAS Level 1 Routines
-=====================
-
-
-.. container::
-
-
-   BLAS Level 1 includes routines and functions, which perform
-   vector-vector operations. The following table lists the BLAS Level 1
-   routine and function groups and the data types associated with them.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routine or Function Group with SYCL Buffer
-           -     Data Types     
-           -     Description     
-         * -           \ `asum <asum.html>`__\    
-           -     float, double, mixed float and std::complex<float>,       mixed double and std::complex<double>    
-           -     Sum of vector magnitudes (functions)     
-         * -           \ `axpy <axpy.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Scalar-vector product (routines)     
-         * -           \ `copy <copy.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Copy vector (routines)     
-         * -           \ `dot <dot.html>`__\    
-           -     float, double, mixed float and double     
-           -     Dot product (functions)     
-         * -           \ `sdsdot <sdsdot.html>`__\    
-           -     mixed float and double     
-           -     Dot product with double precision (functions)     
-         * -           \ `dotc <dotc.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Dot product conjugated (functions)     
-         * -           \ `dotu <dotu.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Dot product unconjugated (functions)     
-         * -           \ `nrm2 <nrm2.html>`__\    
-           -     float, double, mixed float and std::complex<float>,       mixed double and std::complex<double>    
-           -     Vector 2-norm (Euclidean norm) (functions)     
-         * -           \ `rot <rot.html>`__\    
-           -     float, double, mixed float and std::complex<float>,       mixed double and std::complex<double>    
-           -     Plane rotation of points (routines)     
-         * -           \ `rotg <rotg.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Generate Givens rotation of points (routines)     
-         * -           \ `rotm <rotm.html>`__\    
-           -     float, double     
-           -     Modified Givens plane rotation of points (routines)          
-         * -           \ `rotmg <rotmg.html>`__\    
-           -     float, double     
-           -     Generate modified Givens plane rotation of points       (routines)    
-         * -           \ `scal <scal.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>, mixed float and std::complex<float>, mixed      double and std::complex<double>    
-           -     Vector-scalar product (routines)     
-         * -           \ `swap <swap.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Vector-vector swap (routines)     
-         * -           \ `iamax <iamax.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Index of the maximum absolute value element of a       vector (functions)    
-         * -           \ `iamin <iamin.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Index of the minimum absolute value element of a       vector (functions)    
-
-.. toctree::
-    :hidden:
-
-    asum
-    axpy
-    copy
-    dot
-    dotc
-    dotu
-    iamax
-    iamin
-    nrm2
-    rot
-    rotg
-    rotm
-    rotmg
-    scal
-    sdsdot
-    swap
-
diff --git a/docs/domains/blas/blas-level-1-routines.rst b/docs/domains/blas/blas-level-1-routines.rst
new file mode 100644
index 000000000..569749ee4
--- /dev/null
+++ b/docs/domains/blas/blas-level-1-routines.rst
@@ -0,0 +1,76 @@
+.. _blas-level-1-routines:
+
+BLAS Level 1 Routines
+=====================
+
+
+.. container::
+
+
+   BLAS Level 1 includes routines which perform
+   vector-vector operations as described in the following table. 
+
+
+   .. container:: tablenoborder
+
+
+      .. list-table:: 
+         :header-rows: 1
+
+         * -     Routines
+           -     Description     
+         * -     \ `asum <asum.html>`__\   
+           -     Sum of vector magnitudes      
+         * -     \ `axpy <axpy.html>`__\   
+           -     Scalar-vector product      
+         * -     \ `copy <copy.html>`__\   
+           -     Copy vector      
+         * -     \ `dot <dot.html>`__\   
+           -     Dot product      
+         * -     \ `sdsdot <sdsdot.html>`__\   
+           -     Dot product with double precision      
+         * -     \ `dotc <dotc.html>`__\   
+           -     Dot product conjugated      
+         * -     \ `dotu <dotu.html>`__\
+           -     Dot product unconjugated      
+         * -     \ `nrm2 <nrm2.html>`__\   
+           -     Vector 2-norm (Euclidean norm)      
+         * -     \ `rot <rot.html>`__\
+           -     Plane rotation of points      
+         * -     \ `rotg <rotg.html>`__\   
+           -     Generate Givens rotation of points      
+         * -     \ `rotm <rotm.html>`__\   
+           -     Modified Givens plane rotation of points           
+         * -     \ `rotmg <rotmg.html>`__\   
+           -     Generate modified Givens plane rotation of points           
+         * -     \ `scal <scal.html>`__\
+           -     Vector-scalar product      
+         * -     \ `swap <swap.html>`__\   
+           -     Vector-vector swap      
+         * -     \ `iamax <iamax.html>`__\   
+           -     Index of the maximum absolute value element of a vector     
+         * -     \ `iamin <iamin.html>`__\   
+           -     Index of the minimum absolute value element of a vector     
+
+.. toctree::
+    :hidden:
+
+    asum
+    axpy
+    copy
+    dot
+    sdsdot
+    dotc
+    dotu
+    nrm2
+    rot
+    rotg
+    rotm
+    rotmg
+    scal
+    swap
+    iamax
+    iamin
+
+
+**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-level-2-routines.inc.rst b/docs/domains/blas/blas-level-2-routines.inc.rst
deleted file mode 100644
index dbe97bbae..000000000
--- a/docs/domains/blas/blas-level-2-routines.inc.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-.. _blas-level-2-routines:
-
-BLAS Level 2 Routines
-=====================
-
-
-.. container::
-
-
-   This section describes BLAS Level 2 routines, which perform
-   matrix-vector operations. The following table lists the BLAS Level 2
-   routine groups and the data types associated with them.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routine or Function Group with SYCL Buffer
-           -     Data Types     
-           -     Description     
-         * -           \ `gbmv <gbmv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Matrix-vector product using a general band matrix          
-         * -           \ `gemv <gemv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Matrix-vector product using a general matrix     
-         * -           \ `ger <ger.html>`__\    
-           -     float, double     
-           -     Rank-1 update of a general matrix     
-         * -           \ `gerc <gerc.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-1 update of a conjugated general matrix     
-         * -           \ `geru <geru.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-1 update of a general matrix, unconjugated          
-         * -           \ `hbmv <hbmv.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Matrix-vector product using a Hermitian band matrix          
-         * -           \ `hemv <hemv.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Matrix-vector product using a Hermitian matrix          
-         * -           \ `her <her.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-1 update of a Hermitian matrix     
-         * -           \ `her2 <her2.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-2 update of a Hermitian matrix     
-         * -           \ `hpmv <hpmv.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Matrix-vector product using a Hermitian packed matrix          
-         * -           \ `hpr <hpr.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-1 update of a Hermitian packed matrix     
-         * -           \ `hpr2 <hpr2.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Rank-2 update of a Hermitian packed matrix     
-         * -           \ `sbmv <sbmv.html>`__\    
-           -     float, double     
-           -     Matrix-vector product using symmetric band matrix          
-         * -           \ `spmv <spmv.html>`__\    
-           -     float, double     
-           -     Matrix-vector product using a symmetric packed matrix          
-         * -           \ `spr <spr.html>`__\    
-           -     float, double     
-           -     Rank-1 update of a symmetric packed matrix     
-         * -           \ `spr2 <spr2.html>`__\    
-           -     float, double     
-           -     Rank-2 update of a symmetric packed matrix     
-         * -           \ `symv <symv.html>`__\    
-           -     float, double     
-           -     Matrix-vector product using a symmetric matrix          
-         * -           \ `syr <syr.html>`__\    
-           -     float, double     
-           -     Rank-1 update of a symmetric matrix     
-         * -           \ `syr2 <syr2.html>`__\    
-           -     float, double     
-           -     Rank-2 update of a symmetric matrix     
-         * -           \ `tbmv <tbmv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Matrix-vector product using a triangular band matrix          
-         * -           \ `tbsv <tbsv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Solution of a linear system of equations with a       triangular band matrix    
-         * -           \ `tpmv <tpmv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Matrix-vector product using a triangular packed matrix          
-         * -           \ `tpsv <tpsv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Solution of a linear system of equations with a       triangular packed matrix    
-         * -           \ `trmv <trmv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Matrix-vector product using a triangular matrix          
-         * -           \ `trsv <trsv.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Solution of a linear system of equations with a       triangular matrix    
-
-
-
-
-.. toctree::
-    :hidden:
-
-    gbmv
-    gemv
-    ger
-    gerc
-    geru
-    hbmv
-    hemv
-    her
-    her2
-    hpmv
-    hpr
-    hpr2
-    sbmv
-    spmv
-    spr
-    spr2
-    symv
-    syr
-    syr2
-    tbmv
-    tbsv
-    tpmv
-    tpsv
-    trmv
-    trsv
diff --git a/docs/domains/blas/blas-level-2-routines.rst b/docs/domains/blas/blas-level-2-routines.rst
new file mode 100644
index 000000000..1ff643beb
--- /dev/null
+++ b/docs/domains/blas/blas-level-2-routines.rst
@@ -0,0 +1,104 @@
+.. _blas-level-2-routines:
+
+BLAS Level 2 Routines
+=====================
+
+
+.. container::
+
+   BLAS Level 2 includes routines which perform
+   matrix-vector operations as described in the following table. 
+
+
+   .. container:: tablenoborder
+
+
+      .. list-table:: 
+         :header-rows: 1
+
+         * -     Routines
+           -     Description  
+         * -     \ `gbmv <gbmv.html>`__\   
+           -     Matrix-vector product using a general band matrix         
+         * -     \ `gemv <gemv.html>`__\   
+           -     Matrix-vector product using a general matrix     
+         * -     \ `ger <ger.html>`__\   
+           -     Rank-1 update of a general matrix     
+         * -     \ `gerc <gerc.html>`__\   
+           -     Rank-1 update of a conjugated general matrix     
+         * -     \ `geru <geru.html>`__\   
+           -     Rank-1 update of a general matrix, unconjugated          
+         * -     \ `hbmv <hbmv.html>`__\   
+           -     Matrix-vector product using a Hermitian band matrix          
+         * -     \ `hemv <hemv.html>`__\
+           -     Matrix-vector product using a Hermitian matrix          
+         * -     \ `her <her.html>`__\   
+           -     Rank-1 update of a Hermitian matrix     
+         * -     \ `her2 <her2.html>`__\   
+           -     Rank-2 update of a Hermitian matrix     
+         * -     \ `hpmv <hpmv.html>`__\   
+           -     Matrix-vector product using a Hermitian packed matrix          
+         * -     \ `hpr <hpr.html>`__\   
+           -     Rank-1 update of a Hermitian packed matrix     
+         * -     \ `hpr2 <hpr2.html>`__\   
+           -     Rank-2 update of a Hermitian packed matrix     
+         * -     \ `sbmv <sbmv.html>`__\   
+           -     Matrix-vector product using a symmetric band matrix
+         * -     \ `spmv <spmv.html>`__\   
+           -     Matrix-vector product using a symmetric packed matrix          
+         * -     \ `spr <spr.html>`__\   
+           -     Rank-1 update of a symmetric packed matrix     
+         * -     \ `spr2 <spr2.html>`__\   
+           -     Rank-2 update of a symmetric packed matrix     
+         * -     \ `symv <symv.html>`__\   
+           -     Matrix-vector product using a symmetric matrix          
+         * -     \ `syr <syr.html>`__\   
+           -     Rank-1 update of a symmetric matrix     
+         * -     \ `syr2 <syr2.html>`__\   
+           -     Rank-2 update of a symmetric matrix     
+         * -     \ `tbmv <tbmv.html>`__\   
+           -     Matrix-vector product using a triangular band matrix          
+         * -     \ `tbsv <tbsv.html>`__\   
+           -     Solution of a linear system of equations with a triangular band matrix    
+         * -     \ `tpmv <tpmv.html>`__\   
+           -     Matrix-vector product using a triangular packed matrix          
+         * -     \ `tpsv <tpsv.html>`__\   
+           -     Solution of a linear system of equations with a triangular packed matrix    
+         * -     \ `trmv <trmv.html>`__\   
+           -     Matrix-vector product using a triangular matrix          
+         * -     \ `trsv <trsv.html>`__\   
+           -     Solution of a linear system of equations with a triangular matrix    
+
+
+
+
+.. toctree::
+    :hidden:
+
+    gbmv
+    gemv
+    ger
+    gerc
+    geru
+    hbmv
+    hemv
+    her
+    her2
+    hpmv
+    hpr
+    hpr2
+    sbmv
+    spmv
+    spr
+    spr2
+    symv
+    syr
+    syr2
+    tbmv
+    tbsv
+    tpmv
+    tpsv
+    trmv
+    trsv
+
+**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-level-3-routines.inc.rst b/docs/domains/blas/blas-level-3-routines.inc.rst
deleted file mode 100644
index a80c18bfc..000000000
--- a/docs/domains/blas/blas-level-3-routines.inc.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-.. _blas-level-3-routines:
-
-BLAS Level 3 Routines
-=====================
-
-
-.. container::
-
-
-   BLAS Level 3 routines perform matrix-matrix operations. The following
-   table lists the BLAS Level 3 routine groups and the data types
-   associated with them.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routine or Function Group with SYCL Buffer
-           -     Data Types     
-           -     Description     
-         * -           \ `gemm <gemm.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Computes a matrix-matrix product with general       matrices.   
-         * -           \ `hemm <hemm.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Computes a matrix-matrix product where one input       matrix is Hermitian and one is general.   
-         * -           \ `herk <herk.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Performs a Hermitian rank-k update.    
-         * -           \ `her2k <her2k.html>`__\    
-           -     std::complex<float>, std::complex<double>     
-           -     Performs a Hermitian rank-2k update.    
-         * -           \ `symm <symm.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Computes a matrix-matrix product where one input       matrix is symmetric and one matrix is general.   
-         * -           \ `syrk <syrk.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Performs a symmetric rank-k update.    
-         * -           \ `syr2k <syr2k.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Performs a symmetric rank-2k update.    
-         * -           \ `trmm <trmm.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Computes a matrix-matrix product where one input       matrix is triangular and one input matrix is general.   
-         * -           \ `trsm <trsm.html>`__\    
-           -     float, double, std::complex<float>,       std::complex<double>    
-           -     Solves a triangular matrix equation (forward or       backward solve).   
-
-
-
-
-   -  
-
-
-      .. container::
-         :name: LI_21BA86AC0A4942A79BA0C7DC4ABC50C4
-
-
-         The BLAS functions are blocked where possible to restructure
-         the code in a way that increases the localization of data
-         reference, enhances cache memory use, and reduces the
-         dependency on the memory bus.
-
-
-   -  
-
-
-      .. container::
-         :name: LI_9D82DEDFA672416D9B3EA8C9C2B6F0A3
-
-
-         The code is distributed across the processors to maximize
-         parallelism.
-
-
-.. toctree::
-    :hidden:
-
-    gemm
-    hemm
-    her2k
-    herk
-    symm
-    syr2k
-    syrk
-    trmm
-    trsm
diff --git a/docs/domains/blas/blas-level-3-routines.rst b/docs/domains/blas/blas-level-3-routines.rst
new file mode 100644
index 000000000..ffdf0b45e
--- /dev/null
+++ b/docs/domains/blas/blas-level-3-routines.rst
@@ -0,0 +1,56 @@
+.. _blas-level-3-routines:
+
+BLAS Level 3 Routines
+=====================
+
+
+.. container::
+
+
+   BLAS Level 3 includes routines which perform
+   matrix-matrix operations as described in the following table. 
+
+
+   .. container:: tablenoborder
+
+
+      .. list-table:: 
+         :header-rows: 1
+
+         * -     Routines
+           -     Description     
+         * -     \ `gemm <gemm.html>`__\   
+           -     Computes a matrix-matrix product with general matrices.   
+         * -     \ `hemm <hemm.html>`__\   
+           -     Computes a matrix-matrix product where one input matrix is Hermitian and one is general.   
+         * -     \ `herk <herk.html>`__\   
+           -     Performs a Hermitian rank-k update.    
+         * -     \ `her2k <her2k.html>`__\   
+           -     Performs a Hermitian rank-2k update.    
+         * -     \ `symm <symm.html>`__\   
+           -     Computes a matrix-matrix product where one input matrix is symmetric and one matrix is general.   
+         * -     \ `syrk <syrk.html>`__\   
+           -     Performs a symmetric rank-k update.    
+         * -     \ `syr2k <syr2k.html>`__\   
+           -     Performs a symmetric rank-2k update.    
+         * -     \ `trmm <trmm.html>`__\   
+           -     Computes a matrix-matrix product where one input matrix is triangular and one input matrix is general.   
+         * -     \ `trsm <trsm.html>`__\   
+           -     Solves a triangular matrix equation (forward or backward solve).   
+
+
+
+.. toctree::
+    :hidden:
+
+    gemm
+    hemm
+    herk
+    her2k
+    symm
+    syrk
+    syr2k
+    trmm
+    trsm
+
+**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-like-extensions.rst b/docs/domains/blas/blas-like-extensions.rst
new file mode 100644
index 000000000..296ceb522
--- /dev/null
+++ b/docs/domains/blas/blas-like-extensions.rst
@@ -0,0 +1,47 @@
+.. _blas-like-extensions:
+
+BLAS-like Extensions
+====================
+
+
+.. container::
+
+
+   oneAPI Math Kernel Library DPC++ provides additional routines to
+   extend the functionality of the BLAS routines. These include routines
+   to compute many independent matrix-matrix products.
+
+   The following table lists the BLAS-like Extensions with their descriptions.
+
+
+   .. container:: tablenoborder
+
+
+      .. list-table:: 
+         :header-rows: 1
+
+         * -     Routines
+           -     Description     
+         * -     \ `axpy_batch <axpy_batch.html>`__\   
+           -     Computes groups of vector-scalar products added to a vector.
+         * -     \ `gemm_batch <gemm_batch.html>`__\   
+           -     Computes groups of matrix-matrix products with general matrices.   
+         * -     \ `trsm_batch <trsm_batch.html>`__\   
+           -     Solves a triangular matrix equation for a group of matrices.   
+         * -     \ `gemmt <gemmt.html>`__\   
+           -     Computes a matrix-matrix product with general matrices, but updates
+                 only the upper or lower triangular part of the result matrix.
+         * -     \ `gemm_ext <gemm_ext.html>`__\   
+           -     Computes a matrix-matrix product with general matrices.
+ 
+
+.. toctree::
+    :hidden:
+
+    axpy_batch
+    gemm_batch
+    trsm_batch
+    gemmt
+    gemm_ext
+
+**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas.rst b/docs/domains/blas/blas.rst
index 313673bf4..c6124fce7 100644
--- a/docs/domains/blas/blas.rst
+++ b/docs/domains/blas/blas.rst
@@ -3,10 +3,15 @@
 BLAS Routines
 +++++++++++++
 
-oneMKL provides a DPC++ interface to the Basic Linear Algebra Subprograms (BLAS) routines.
+oneMKL provides a DPC++ interface to the Basic Linear Algebra Subprograms (BLAS) routines, as well as several BLAS-like extension routines.
+
+.. toctree::
+    :maxdepth: 1
+
+    blas-level-1-routines.rst
+    blas-level-2-routines.rst
+    blas-level-3-routines.rst
+    blas-like-extensions.rst
 
-.. include:: blas-level-1-routines.inc.rst
-.. include:: blas-level-2-routines.inc.rst
-.. include:: blas-level-3-routines.inc.rst
 
 **Parent topic:** :ref:`onemkl`
diff --git a/docs/domains/blas/copy.rst b/docs/domains/blas/copy.rst
index df47419aa..e2a5a3230 100644
--- a/docs/domains/blas/copy.rst
+++ b/docs/domains/blas/copy.rst
@@ -1,4 +1,4 @@
-.. _copy:
+.. _onemkl_blas_copy:
 
 copy
 ====
@@ -10,16 +10,6 @@ copy
    Copies a vector to another vector.
 
 
-   .. container:: section
-      :name: GUID-D6B6C72E-9516-40C9-B034-9F344C41AAF3
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void copy(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy)
 
       ``copy`` supports the following precisions.
 
@@ -37,11 +27,9 @@ copy
 
 
 .. container:: section
-   :name: GUID-5E0A9C5F-BDD5-41E6-97CD-4316FD58C347
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -54,63 +42,140 @@ copy
       y ←x
 
 
-   where x and y are vectors of n elements.
+   where ``x`` and ``y`` are vectors of ``n`` elements.
+
+
+copy (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
+      .. cpp:function::  void onemkl::blas::copy(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    n
-      Number of elements in vector x.
+      Number of elements in vector ``x``.
 
 
    x
-      Buffer holding input vector x. The buffer must be of size at least
+      Buffer holding input vector ``x``. The buffer must be of size at least
       ``(1 + (n – 1)*abs(incx))``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
 .. container:: section
-   :name: GUID-4ABB603B-835C-428B-B880-2F088BAB5456
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    y
-      Buffer holding the updated vector y.
+      Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+copy (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::copy(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to the input vector ``x``. The array holding the vector
+         ``x`` must be of size at least ``(1 + (n – 1)*abs(incx))``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/dot.rst b/docs/domains/blas/dot.rst
index 7388eed8c..7a6a5d7c4 100644
--- a/docs/domains/blas/dot.rst
+++ b/docs/domains/blas/dot.rst
@@ -1,4 +1,4 @@
-.. _dot:
+.. _onemkl_blas_dot:
 
 dot
 ===
@@ -10,16 +10,6 @@ dot
    Computes the dot product of two real vectors.
 
 
-   .. container:: section
-      :name: GUID-13355B56-0278-45E5-B310-3B0AC541C675
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void dot(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy, buffer<T_res,1> &result)
 
       ``dot`` supports the following precisions.
 
@@ -40,11 +30,9 @@ dot
 
 
 .. container:: section
-   :name: GUID-4BC6BF9A-BAB9-4078-A6B5-9C7ECB9D4821
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -58,7 +46,6 @@ dot
 
 
       .. rubric:: Note
-         :name: note
          :class: NoteTipHead
 
 
@@ -66,51 +53,60 @@ dot
       double), the dot product is computed with double precision.
 
 
+dot (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::dot(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T_res,1> &result)
 .. container:: section
-   :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    n
-      Number of elements in vectors x and y.
+      Number of elements in vectors ``x`` and ``y``.
 
 
    x
-      Buffer holding input vector x. The buffer must be of size at least
+      Buffer holding input vector ``x``. The buffer must be of size at least
       ``(1 + (n – 1)*abs(incx))``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
-      Buffer holding input vector y. The buffer must be of size at least
+      Buffer holding input vector ``y``. The buffer must be of size at least
       ``(1 + (n – 1)*abs(incy))``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
 .. container:: section
-   :name: GUID-CAAFE234-AF82-4B61-8406-D57EC527BED5
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -118,19 +114,94 @@ dot
       Buffer where the result (a scalar) will be stored.
 
 
-.. container:: familylinks
+dot (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::dot(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T_res *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      n
+         Number of elements in vectors ``x`` and ``y``.
 
-.. |image0| image:: ../equations/GUID-93DA36DC-40CA-4C01-B883-DABAB0D37ee1.png
+
+      x
+         Pointer to the input vector ``x``. The array holding the vector ``x``
+         must be of size at least ``(1 + (n – 1)*abs(incx))``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to the input vector ``y``. The array holding the vector ``y``
+         must be of size at least ``(1 + (n – 1)*abs(incy))``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         Pointer to where the result (a scalar) will be stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-75532DED-BE44-4D85-B9C0-99C825778ee1.png
    :class: img-middle
 
diff --git a/docs/domains/blas/dotc.rst b/docs/domains/blas/dotc.rst
index 08e07d1d3..dde06cbf8 100644
--- a/docs/domains/blas/dotc.rst
+++ b/docs/domains/blas/dotc.rst
@@ -1,4 +1,4 @@
-.. _dotc:
+.. _onemkl_blas_dotc:
 
 dotc
 ====
@@ -11,16 +11,6 @@ dotc
    first vector.
 
 
-   .. container:: section
-      :name: GUID-9D36611B-564D-475B-8D98-5F53A4F698F5
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void dotc(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy, buffer<T,1> &result)
 
       ``dotc`` supports the following precisions.
 
@@ -36,11 +26,9 @@ dotc
 
 
 .. container:: section
-   :name: GUID-3E4588D2-5FDE-43F1-955E-85173AE62252
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -51,16 +39,27 @@ dotc
    |image0|
 
 
+dotc (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &result)
 .. container:: section
-   :name: GUID-38675523-DEDD-4314-8486-7C66614ED2C7
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -91,11 +90,9 @@ dotc
 
 
 .. container:: section
-   :name: GUID-B84A5D05-6B61-4D13-8185-2A349C41CE46
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -103,19 +100,94 @@ dotc
       The buffer where the result (a scalar) is stored.
 
 
-.. container:: familylinks
+dotc (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::dotc(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      n
+         The number of elements in vectors ``x`` and ``y``.
 
-.. |image0| image:: ../equations/GUID-AED001B6-9056-491F-ACBE-E06C82D17ee1.png
+
+      x
+         Pointer to input vector ``x``. The array holding the input
+         vector ``x`` must be of size at least (1 + (``n`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         The stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding the input
+         vector ``y`` must be of size at least (1 + (``n`` -
+         1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         The stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         The pointer to where the result (a scalar) is stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-B2211D34-A472-4FB8-9CFB-1E11AF4F0ee1.png
    :class: img-middle
 
diff --git a/docs/domains/blas/dotu.rst b/docs/domains/blas/dotu.rst
index 15cb71e7d..607989837 100644
--- a/docs/domains/blas/dotu.rst
+++ b/docs/domains/blas/dotu.rst
@@ -1,4 +1,4 @@
-.. _dotu:
+.. _onemkl_blas_dotu:
 
 dotu
 ====
@@ -10,16 +10,6 @@ dotu
    Computes the dot product of two complex vectors.
 
 
-   .. container:: section
-      :name: GUID-27A695AE-7ED5-4CFF-9783-0E50D111BED2
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void dotu(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy, buffer<T,1> &result)
 
       ``dotu`` supports the following precisions.
 
@@ -35,11 +25,9 @@ dotu
 
 
 .. container:: section
-   :name: GUID-7E67CFC6-917F-41A3-A664-F99EE4E04E43
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -49,16 +37,27 @@ dotu
    |image0|
 
 
+dotu (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &result)
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -74,7 +73,7 @@ dotu
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
@@ -85,15 +84,13 @@ dotu
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
 .. container:: section
-   :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -101,19 +98,94 @@ dotu
       Buffer where the result (a scalar) is stored.
 
 
-.. container:: familylinks
+dotu (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::dotu(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      n
+         Number of elements in vectors ``x`` and ``y``.
 
-.. |image0| image:: ../equations/GUID-3605ACD9-02D1-46D7-B791-F2F76F0D9ee1.png
+
+      x
+         Pointer to the input vector ``x``. The array holding input
+         vector ``x`` must be of size at least (1 + (``n`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         Pointer to where the result (a scalar) is stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-42AF2BFE-F8F1-4F96-A4E0-05D4FB5A7ee1.png
    :class: img-middle
 
diff --git a/docs/domains/blas/gbmv.rst b/docs/domains/blas/gbmv.rst
index 524d52972..f07fee183 100644
--- a/docs/domains/blas/gbmv.rst
+++ b/docs/domains/blas/gbmv.rst
@@ -1,4 +1,4 @@
-.. _gbmv:
+.. _onemkl_blas_gbmv:
 
 gbmv
 ====
@@ -10,16 +10,6 @@ gbmv
    Computes a matrix-vector product with a general band matrix.
 
 
-   .. container:: section
-      :name: GUID-870EA7B0-09B5-43FF-90A4-6378B5D94B55
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void gbmv(queue &exec_queue, transpose trans,      std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,      T alpha, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &x,      std::int64_t incx, T beta, buffer<T,1> &y, std::int64_t incy)
 
       ``gbmv`` supports the following precisions.
 
@@ -37,11 +27,9 @@ gbmv
 
 
 .. container:: section
-   :name: GUID-71614419-BC91-4A1A-B743-FE52767C4926
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -73,16 +61,27 @@ gbmv
    -  ``x`` and ``y`` are vectors.
 
 
+gbmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -159,11 +158,9 @@ gbmv
 
 
 .. container:: section
-   :name: GUID-4B31584D-BC63-4032-A4A7-61BF3F163165
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -171,15 +168,138 @@ gbmv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+gbmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See
+         :ref:`onemkl_datatypes` for
+         more details.
+
+
+      m
+         Number of rows of ``A``. Must be at least zero.
+
+
+      n
+         Number of columns of ``A``. Must be at least zero.
+
+
+      kl
+         Number of sub-diagonals of the matrix ``A``. Must be at least
+         zero.
+
+
+      ku
+         Number of super-diagonals of the matrix ``A``. Must be at least
+         zero.
 
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least (``kl`` +
+         ``ku`` + 1), and positive.
+
+
+      x
+         Pointer to input vector ``x``. The length ``len`` of vector
+         ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A``
+         is transposed. The array holding input vector ``x`` must be of
+         size at least (1 + (``len`` - 1)*abs(``incx``)). See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The length ``len`` of
+         vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
+         ``A`` is transposed. The array holding input/output vector
+         ``y`` must be of size at least (1 + (``len`` -
+         1)*abs(``incy``)) where ``len`` is this length. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/gemm.rst b/docs/domains/blas/gemm.rst
index 8e529b1e8..9670344b2 100644
--- a/docs/domains/blas/gemm.rst
+++ b/docs/domains/blas/gemm.rst
@@ -1,4 +1,4 @@
-.. _gemm:
+.. _onemkl_blas_gemm:
 
 gemm
 ====
@@ -10,16 +10,6 @@ gemm
    Computes a matrix-matrix product with general matrices.
 
 
-   .. container:: section
-      :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void gemm(queue &exec_queue, transpose transa,      transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,      T alpha, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b,      std::int64_t ldb, T beta, buffer<T,1> &c, std::int64_t ldc)
 
       ``gemm`` supports the following precisions.
 
@@ -38,15 +28,13 @@ gemm
 
 
 .. container:: section
-   :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The gemm routines compute a scalar-matrix-matrix product and add the
+   The ``gemm`` routines compute a scalar-matrix-matrix product and add the
    result to a scalar-matrix product, with general matrices. The
    operation is defined as
 
@@ -79,31 +67,37 @@ gemm
    ``C`` is an ``m``-by-``n`` matrix.
 
 
+gemm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
 .. container:: section
-   :name: GUID-D89C4959-F0C2-4E91-8853-9225F0772DF0
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    transa
       Specifies the form of ``op(A)``, the transposition operation
-      applied to ``A``. See
-      :ref:`onemkl_datatypes`
-      for more details.
-
+      applied to ``A``.
 
    transb
       Specifies the form of ``op(B)``, the transposition operation
-      applied to ``B``. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      applied to ``B``.
 
 
    m
@@ -176,11 +170,9 @@ gemm
 
 
 .. container:: section
-   :name: GUID-EEF5C7D0-D206-4961-809F-55DCA3E93F68
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -190,11 +182,9 @@ gemm
 
 
 .. container:: section
-   :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
@@ -202,15 +192,154 @@ gemm
    calling ``gemm``.
 
 
-.. container:: familylinks
+gemm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *b, std::int64_t ldb, T beta, T *c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      transa
+         Specifies the form of ``op(A)``, the transposition operation
+         applied to ``A``.
+
+
+      transb
+         Specifies the form of ``op(B)``, the transposition operation
+         applied to ``B``.
+
+
+      m
+         Specifies the number of rows of the matrix ``op(A)`` and of the
+         matrix ``C``. The value of m must be at least zero.
+
+
+      n
+         Specifies the number of columns of the matrix ``op(B)`` and the
+         number of columns of the matrix ``C``. The value of n must be
+         at least zero.
+
+
+      k
+         Specifies the number of columns of the matrix ``op(A)`` and the
+         number of rows of the matrix ``op(B)``. The value of k must be
+         at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-matrix product.
+
 
+      a
+         Pointer to input matrix ``A``. If ``A`` is not transposed,
+         ``A`` is an ``m``-by-``k`` matrix so the array ``a`` must have
+         size at least ``lda``\ \*\ ``k``. If ``A`` is transposed, ``A``
+         is a ``k``-by-``m`` matrix so the array ``a`` must have size
+         at least ``lda``\ \*\ ``m``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         The leading dimension of ``A``. Must be at least ``m`` if ``A`` is
+         not transposed, and at least ``k`` if ``A`` is transposed. It must
+         be positive.
+
+
+      b
+         Pointer to input matrix ``B``. If ``B`` is not transposed,
+         ``B`` is a ``k``-by-``n`` matrix so the array ``b`` must have
+         size at least ``ldb``\ \*\ ``n``. If ``B`` is transposed, ``B``
+         is an ``n``-by-``k`` matrix so the array ``b`` must have size
+         at least ``ldb``\ \*\ ``k``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         The leading dimension of ``B``. Must be at least ``k`` if ``B`` is
+         not transposed, and at least ``n`` if ``B`` is transposed. It must
+         be positive.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         The pointer to input/output matrix ``C``. It must have a size
+         of at least ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         The leading dimension of ``C``. It must be positive and at
+         least ``m``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by
+         ``alpha*op(A)*op(B) + beta*C``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``beta`` = 0, matrix ``C`` does not need to be initialized
+      before calling ``gemm``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/gemm_batch.rst b/docs/domains/blas/gemm_batch.rst
index 11034ab68..b4c940afc 100644
--- a/docs/domains/blas/gemm_batch.rst
+++ b/docs/domains/blas/gemm_batch.rst
@@ -1,4 +1,4 @@
-.. _gemm_batch:
+.. _onemkl_blas_gemm_batch:
 
 gemm_batch
 ==========
@@ -6,81 +6,214 @@ gemm_batch
 
 .. container::
 
+   The ``gemm_batch`` routines are batched versions of `gemm <gemm.html>`__, performing
+   multiple ``gemm`` operations in a single call. Each ``gemm`` 
+   operation performs a matrix-matrix product with general matrices.
+   
+  
+      ``gemm_batch`` supports the following precisions.
 
-   Computes groups of matrix-matrix product with general matrices.
 
+      .. list-table:: 
+         :header-rows: 1
 
-   .. container:: section
-      :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD
+         * -  T 
+         * -  ``float`` 
+         * -  ``double`` 
+         * -  ``std::complex<float>`` 
+         * -  ``std::complex<double>`` 
 
 
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
+gemm_batch (Buffer Version)
+---------------------------
 
+.. container:: section
 
-      **Group API**
 
+   .. rubric:: Description
+      :class: sectiontitle
 
-      .. cpp:function::  void gemm_batch(queue &exec_queue,      buffer<transpose, 1> &transa_array, buffer<transpose,1>      &transb_array, buffer<std::int64_t,1> &m_array,      buffer<std::int64_t,1> &n_array, buffer<std::int64_t,1> &k_array,      buffer<T,1> alpha_array, buffer<T,1> &a_array,      buffer<std::int64_t,1> &lda_array, buffer<T,1> &b_array,      buffer<std::int64_t,1> ldb_array, buffer<T,1> &beta_array,      buffer<T,1> &c, buffer<std::int64_t,1> &ldc_array, std::int64_t      group_count, buffer<std::int64_t,1> &group_size_array)
 
-      **Strided API**
+   The buffer version of ``gemm_batch`` supports only the strided API. 
+   
+   The strided API operation is defined as
 
 
-      .. cpp:function::  void gemm_batch(queue &exec_queue, transpose      transa, transpose transb, std::int64_t m, std::int64_t n,      std::int64_t k, T alpha, buffer<T,1> &a, std::int64_t &lda,      std::int64_t stridea, buffer<T,1> &b, std::int64_t ldb,      std::int64_t strideb, T beta, buffer<T,1> &c, std::int64_t ldc,      std::int64_t stridec, std::int64_t batch_size)
+   ::
 
-      ``gemm_batch`` supports the following precisions.
 
+      for i = 0 … batch_size – 1
+          A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c.
+          C := alpha * op(A) * op(B) + beta * C
+      end for
 
-      .. list-table:: 
-         :header-rows: 1
 
-         * -  T 
-         * -  ``float`` 
-         * -  ``double`` 
-         * -  ``std::complex<float>`` 
-         * -  ``std::complex<double>`` 
+   where:
+
 
+   op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`
 
 
+   ``alpha`` and ``beta`` are scalars
+
+
+   ``A``, ``B``, and ``C`` are matrices
+
+   op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is 
+   ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``.
+
+   The a, b and c buffers contain all the input matrices. The stride 
+   between matrices is given by the stride parameter. The total number
+   of matrices in a, b and c buffers is given by the ``batch_size`` parameter.
+
+   **Strided API**
 
 .. container:: section
-   :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42
 
 
-   .. rubric:: Description
-      :name: description
+   .. rubric:: Syntax
+      :class: sectiontitle
+
+
+   .. cpp:function::  void onemkl::blas::gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, std::int64_t stridea, sycl::buffer<T,1> &b, std::int64_t ldb, std::int64_t strideb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size)
+
+
+.. container:: section
+
+
+   .. rubric:: Input Parameters
+      :class: sectiontitle
+
+
+   queue
+      The queue where the routine should be executed.
+
+
+   transa
+      Specifies ``op(A)``, the transposition operation applied to the
+      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
+
+   transb
+      Specifies ``op(B)``, the transposition operation applied to the
+      matrices ``B``. See :ref:`onemkl_datatypes` for more details.
+
+   m
+      Number of rows of ``op(A)`` and ``C``. Must be at least zero.
+
+
+   n
+      Number of columns of ``op(B)`` and ``C``. Must be at least zero.
+
+
+   k
+      Number of columns of ``op(A)`` and rows of ``op(B)``. Must be at
+      least zero.
+
+
+   alpha
+      Scaling factor for the matrix-matrix products.
+
+
+   a
+      Buffer holding the input matrices ``A`` with size ``stridea*batch_size``.
+
+
+   lda
+      Leading dimension of the matrices ``A``. Must be at least ``m`` if
+      the matrices ``A`` are not transposed, and at least ``k`` if the
+      matrices ``A`` are transposed. Must be positive.
+
+
+   stridea
+      Stride between different ``A`` matrices.
+
+
+   b
+      Buffer holding the input matrices ``B`` with size ``strideb*batch_size``.
+
+
+   ldb
+      Leading dimension of the matrices ``B``. Must be at least ``k`` if
+      the matrices ``B`` are not transposed, and at least ``n`` if the
+      matrices ``B`` are transposed. Must be positive.
+
+
+   strideb
+      Stride between different ``B`` matrices.
+
+
+   beta
+      Scaling factor for the matrices ``C``.
+
+
+   c
+      Buffer holding input/output matrices ``C`` with size ``stridec*batch_size``.
+
+
+   ldc
+      Leading dimension of ``C``. Must be positive and at least ``m``.
+
+
+   stridec
+      Stride between different ``C`` matrices. Must be at least
+      ``ldc*n``.
+
+
+   batch_size
+      Specifies the number of matrix multiply operations to perform.
+
+
+.. container:: section
+
+
+   .. rubric:: Output Parameters
       :class: sectiontitle
 
 
-   The gemm_batch routines perform a series of matrix-matrix operations
-   with general matrices. They are similar to the gemm routine
-   counterparts, but the gemm_batch routines perform matrix-matrix
-   operations with groups of matrices. The groups contain matrices with
-   the same parameters.
+   c
+      Output buffer, overwritten by ``batch_size`` matrix multiply
+      operations of the form ``alpha*op(A)*op(B) + beta*C``.
+
+
+.. container:: section
+
+
+   .. rubric:: Notes
+      :class: sectiontitle
+
+
+   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
+   calling ``gemm_batch``.
+
+
+gemm_batch (USM Version)
+---------------------------
+
+.. container:: section
+
+   .. rubric:: Description
+      :class: sectiontitle
+
 
+   The USM version of ``gemm_batch`` supports the group API and strided API. 
 
-   For the group API, the operation is defined as
+   The group API operation is defined as
 
 
    ::
 
 
-      offa = 0, offb = 0, offc = 0
+      idx = 0
       for i = 0 … group_count – 1
-          transa, transb, m, n, k, lda, ldb, ldc, alpha, beta and group_size at position i in transa_array, transb_array, m_array, n_array, k_array, lda_array, ldb_array, ldc_array, alpha_array, beta_array and group_size_array
-          sizea = transa == onemkl::transpose::N ? lda * k : lda * m;
-          sizeb = transb == onemkl::transpose::N ? ldb * n : ldb * k;
-          sizec = ldc * n;
           for j = 0 … group_size – 1
-              A, B, and C are matrices of size sizea, sizeb and sizec at offset offa, offb and offc in a, b and c.
-              C := alpha * op(A) * op(B) + beta * C
-              offa += sizea, offb += sizeb, offc += sizec
+              A, B, and C are matrices in a[idx], b[idx] and c[idx]
+              C := alpha[i] * op(A) * op(B) + beta[i] * C
+              idx = idx + 1
           end for
       end for
 
 
-   For the strided API, the operation is defined as
+   The strided API operation is defined as
 
 
    ::
@@ -88,220 +221,214 @@ gemm_batch
 
       for i = 0 … batch_size – 1
           A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c.
-          C = alpha * op(A) * op(B) + beta * C
+          C := alpha * op(A) * op(B) + beta * C
       end for
 
 
    where:
 
 
-   -  op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) =
-      X\ :sup:`H`
-
+   op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`
 
-   -  ``alpha`` and ``beta`` are scalars
 
+   ``alpha`` and ``beta`` are scalars
 
-   -  ``A``, ``B``, and ``C`` are matrices
 
+   ``A``, ``B``, and ``C`` are matrices
+   
+   op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``.
 
-   -  The a, b and c buffers contains all the input matrices. The stride
-      between matrices is either given by the exact size of the matrix
-      (for the group API) or by the stride parameter. The total number
-      of matrices in a, b and c buffers is given by the 
+    
+   For the group API, the a, b and c arrays contain pointers to all the input matrices.
+   The total number of matrices in a, b and c is given by:
+    
+      total_batch_count = sum of all of the group_size entries    
+    
+    
+   For the strided API, the a, b and c arrays contain all the input matrices. The total number of matrices
+   in a, b and c is given by the ``batch_size`` parameter.
       
-      |image0| 
-      
-      for the
-      group API or by the ``batch_size`` parameter for the strided API.
+   **Group API**
 
+.. container:: section
 
-   Here, op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is
-   ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``.
 
+   .. rubric:: Syntax
+      :class: sectiontitle
 
-.. container:: section
-   :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A
 
+   .. container:: dlsyntaxpara
+   
+      .. cpp:function::  sycl::event onemkl::blas::gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, T *alpha, const T **a, std::int64_t *lda, const T **b, std::int64_t *ldb, T *beta, T **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, const sycl::vector_class<sycl::event> &dependencies = {})
 
-   .. rubric:: Input Parameters - Group API
-      :name: input-parameters---group-api
-      :class: sectiontitle
 
+.. container:: section
 
-   transa_array
-      Buffer holding ``group_count onemkl::transpose`` value.
 
+   .. rubric:: Input Parameters
+      :class: sectiontitle
 
-      For the group ``i``, ``transa`` is the ``i``\ th element in the
-      transa_array buffer and specifies the form of ``op(A)`` used in
-      the matrix multiplication. See
-      :ref:`onemkl_datatypes` for more
-      details.
 
+   queue
+      The queue where the routine should be executed.
 
-   transb_array
-      Buffer holding ``group_count onemkl::transpose`` value.
 
+   transa
+      Array of ``group_count`` ``onemkl::transpose`` values. ``transa[i]`` specifies the form of ``op(A)`` used in
+      the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details.
 
-      For the group ``i``, ``transb`` is the ``i``\ th element in the
-      transb_array buffer and specifies the form of ``op(B)`` used in
-      the matrix multiplication. See
-      :ref:`onemkl_datatypes` for more
-      details.
 
+   transb
+      Array of ``group_count`` ``onemkl::transpose`` values. ``transb[i]`` specifies the form of ``op(B)`` used in
+      the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details.
 
-   m_array
-      Buffer holding ``group_count`` integer. For the group ``i``, ``m``
-      is the ``i``\ th element in the m_array buffer and specifies the
-      number of rows of ``op(A)`` and ``C``. Must be at least zero.
+
+   m
+      Array of ``group_count`` integers. ``m[i]`` specifies the
+      number of rows of ``op(A)`` and ``C`` for every matrix in group ``i``. All entries must be at least zero.
 
 
-   n_array
-      Buffer holding ``group_count`` integer. For the group ``i``, ``n``
-      is the ``i``\ th element in the n_array buffer and specifies the
-      number of columns of ``op(B)`` and ``C``. Must be at least zero.
+   n
+      Array of ``group_count`` integers. ``n[i]`` specifies the
+      number of columns of ``op(B)`` and ``C`` for every matrix in group ``i``. All entries must be at least zero.
 
 
-   k_array
-      Buffer holding ``group_count`` integer. For the group ``i``, ``k``
-      is the ``i``\ th element in the k_array buffer and specifies the
-      number of columns of ``op(A)`` and rows of ``op(B)``. Must be at
+   k
+      Array of ``group_count`` integers. ``k[i]`` specifies the
+      number of columns of ``op(A)`` and rows of ``op(B)`` for every matrix in group ``i``. All entries must be at
       least zero.
 
 
-   alpha_array
-      Buffer holding ``group_count`` scalar element. For the group
-      ``i``, ``alpha`` is the ``i``\ th element in the alpha_array
-      buffer and specifies the scaling factor for the matrix-matrix
-      product.
+   alpha
+      Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for every matrix-matrix
+      product in group ``i``.
 
 
    a
-      Buffer holding the input matrices ``A``. The total size of the
-      buffer ``a`` must be at least the sum of the sizes of all the
-      matricies ``A``. That is,
+      Array of pointers to input matrices ``A`` with size ``total_batch_count``. 
+      
+      See `Matrix Storage <../matrix-storage.html>`__ for more details.
 
 
-      |image1|
+   lda
+      Array of ``group_count`` integers. ``lda[i]`` specifies the leading dimension of ``A`` for every matrix in group ``i``. 
+      All entries must be at least ``m``
+      if ``A`` is not transposed, and at least ``k`` if ``A`` is
+      transposed. All entries must be positive.
 
 
-      where
-      ``sizeai = lda_array[i] * (transa == onemkl::transpose::N ? k : m)``
+   b
+      Array of pointers to input matrices ``B`` with size ``total_batch_count``. 
+      
+      See `Matrix Storage <../matrix-storage.html>`__ for more details.
 
 
-      See `Matrix
-      Storage <../matrix-storage.html>`__ for
-      more details.
+   ldb
+      Array of ``group_count`` integers. ``ldb[i]`` specifies the leading dimension of ``B`` for every matrix in group ``i``. 
+      All entries must be at least ``k``
+      if ``B`` is not transposed, and at least ``n`` if ``B`` is
+      transposed. All entries must be positive.
 
 
-   lda_array
-      Buffer holding ``group_count`` integer. For the group ``i``,
-      ``lda`` is the ``i``\ th element in the lda_array buffer and
-      specifies the leading dimension of ``A``. Must be at least ``m``
-      if ``A`` is not transposed, and at least ``k`` if ``A`` is
-      transposed. Must be positive.
+   beta
+      Array of ``group_count`` scalar elements. ``beta[i]`` specifies the scaling factor for matrix ``C`` 
+      for every matrix in group ``i``.
 
 
-   b
-      Buffer holding the input matrices ``B``. The total size of the
-      buffer ``b`` must be at least the sum of the sizes of all the
-      matricies ``B``. That is,
+   c
+      Array of pointers to input/output matrices ``C`` with size ``total_batch_count``. 
+      
+      See `Matrix Storage <../matrix-storage.html>`__ for more details.
+
+
+   ldc
+      Array of ``group_count`` integers. ``ldc[i]`` specifies the leading dimension of ``C`` for every matrix in group ``i``. 
+      All entries must be positive and at least ``m``.
 
 
-      |image2|
+   group_count
+      Specifies the number of groups. Must be at least 0.
 
 
-      where
-      ``sizebi = ldb_array[i] * (transb == onemkl::transpose::N ? n : k)``
+   group_size
+      Array of ``group_count`` integers. ``group_size[i]`` specifies the
+      number of matrix multiply products in group ``i``. All entries must be at least 0.
 
 
-      See `Matrix
-      Storage <../matrix-storage.html>`__ for
-      more details.
+   dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
 
 
-   ldb_array
-      Buffer holding ``group_count`` integer. For the group ``i``,
-      ``ldb`` is the ``i``\ th element in the ldb_array buffer and
-      specifies the leading dimension of ``B``. Must be at least ``k``
-      if ``B`` is not transposed, and at least ``n`` if ``B`` is
-      transposed. Must be positive.
+.. container:: section
 
 
-   beta_array
-      Buffer holding ``group_count`` scalar element. For the group
-      ``i``, ``beta`` is the ``i``\ th element in the beta_array buffer
-      and specifies the scaling factor for matrix C.
+   .. rubric:: Output Parameters
+      :class: sectiontitle
 
 
    c
-      Buffer holding the input/output matrices ``C``. The total size of
-      the buffer ``c`` must be at least the sum of the sizes of all the
-      matricies ``C``. That is,
+      Overwritten by the ``m[i]``-by-``n[i]`` matrix calculated by 
+      ``(alpha[i]*op(A)*op(B) + beta[i]*C)`` for group ``i``.
 
 
-      |image3|
 
+   .. container:: section
 
-      See `Matrix
-      Storage <../matrix-storage.html>`__ for
-      more details.
 
+      .. rubric:: Notes
+         :class: sectiontitle
 
-   ldc_array
-      Buffer holding ``group_count`` integer. For the group ``i``,
-      ``ldc`` is the ``i``\ th element in the ldc_array buffer and
-      specifies the leading dimension of ``C``. Must be positive and at
-      least ``m``.
 
+      If ``beta`` = 0, matrix ``C`` does not need to be initialized
+      before calling ``gemm_batch``.
 
-   group_count
-      Specifies the number of groups. Must be at least 0.
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
 
 
-   group_size_array
-      Buffer holding ``group_count`` integer. For the group ``i``, the
-      ``i``\ th element in the group_size_array buffer specifies the
-      number of matrix multiply operations in group ``i``. Each element
-      in ``group_size_array`` must be at least 0.
+      Output event to wait on to ensure computation is complete.
 
 
+
+
+   **Strided API**
+
 .. container:: section
-   :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700
 
 
-   .. rubric:: Output Parameters - Group API
-      :name: output-parameters---group-api
+   .. rubric:: Syntax
       :class: sectiontitle
 
+   .. container:: dlsyntaxpara
 
-   c
-      Overwritten by the ``m``\ :sub:`i`-by-``n``\ :sub:`i` matrix
-      ``(alphai*op(A)*op(B) + betai*C)`` for group ``i``.
+      .. cpp:function::  sycl::event onemkl::blas::gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, std::int64_t stridea, const T *b, std::int64_t ldb, std::int64_t strideb, T beta, T *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, const sycl::vector_class<sycl::event> &dependencies = {})
 
 
 .. container:: section
-   :name: GUID-D067773A-45A3-4D24-B10A-46E27834947E
 
 
-   .. rubric:: Input Parameters - Strided API
-      :name: input-parameters---strided-api
+   .. rubric:: Input Parameters
       :class: sectiontitle
 
 
+   queue
+      The queue where the routine should be executed.
+
+
    transa
       Specifies ``op(A)`` the transposition operation applied to the
-      matrices A. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
+
 
 
    transb
       Specifies ``op(B)`` the transposition operation applied to the
-      matrices B. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      matrices ``B``. See :ref:`onemkl_datatypes` for more details.
 
 
    m
@@ -322,8 +449,7 @@ gemm_batch
 
 
    a
-      Buffer holding the input matrices ``A``. Must have size at least
-      ``stridea*batch_size``.
+      Pointer to input matrices ``A`` with size ``stridea*batch_size``.
 
 
    lda
@@ -333,20 +459,11 @@ gemm_batch
 
 
    stridea
-      Stride between the different ``A`` matrices.
-
-
-      If ``A`` are not transposed, the matrices ``A`` are ``m``-by-``k``
-      matrices so stridea must be at least ``lda*k``.
-
-
-      If ``A`` are transposed, the matrices ``A`` are ``k``-by-``m``
-      matrices so stridea must be at least ``lda*m``.
+      Stride between different ``A`` matrices.
 
 
    b
-      Buffer holding the input matrices ``B``. Must have size at least
-      ``strideb*batch_size``.
+      Pointer to input matrices ``B`` with size ``strideb*batch_size``.
 
 
    ldb
@@ -356,24 +473,16 @@ gemm_batch
 
 
    strideb
-      Stride between the different ``B`` matrices.
-
-
-      If ``B`` are not transposed, the matrices ``B`` are ``k``-by-``n``
-      matrices so strideb must be at least ``ldb*n``.
+      Stride between different ``B`` matrices.
 
 
-      If ``B`` are transposed, the matrices ``B`` are ``n``-by-``k``
-      matrices so strideb must be at least ``ldb*k``.
-
 
    beta
       Scaling factor for the matrices ``C``.
 
 
    c
-      Buffer holding input/output matrices ``C``. Must have size at
-      least ``stridec*batch_size``.
+      Pointer to input/output matrices ``C`` with size ``stridec*batch_size``.
 
 
    ldc
@@ -381,60 +490,57 @@ gemm_batch
 
 
    stridec
-      Stride between the different ``C`` matrices. Must be at least
-      ``ldc*n``.
+      Stride between different ``C`` matrices.
 
 
    batch_size
       Specifies the number of matrix multiply operations to perform.
 
 
+   dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
 .. container:: section
-   :name: GUID-98C3DE17-4F5F-41A1-B431-48148153ABBA
 
 
-   .. rubric:: Output Parameters - Strided API
-      :name: output-parameters---strided-api
+   .. rubric:: Output Parameters
       :class: sectiontitle
 
 
    c
-      Output buffer, overwritten by ``batch_size`` matrix multiply
-      operations of the form\ ``alpha*op(A)*op(B) + beta*C``.
+      Output matrices, overwritten by ``batch_size`` matrix multiply
+      operations of the form ``alpha*op(A)*op(B) + beta*C``.
 
 
 .. container:: section
-   :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
    If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling gemm_batch.
+   calling ``gemm_batch``.
 
 
-.. container:: familylinks
+.. container:: section
 
 
-   .. container:: parentlink
+      .. rubric:: Return Values
+         :class: sectiontitle
 
 
-      **Parent topic:** :ref:`blas-like-extensions`
-      
+      Output event to wait on to ensure computation is complete.
 
 
-.. container::
+.. container:: familylinks
+
+
+   .. container:: parentlink
 
 
-.. |image0| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee1.png
-   :class: img-middle
-.. |image1| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee2.png
-   :class: img-middle
-.. |image2| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee3.png
-   :class: img-middle
-.. |image3| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee4.png
-   :class: img-middle
+      **Parent topic:** :ref:`blas-like-extensions`
+      
 
diff --git a/docs/domains/blas/gemm_ext.rst b/docs/domains/blas/gemm_ext.rst
index a49fabe8d..0d2d3c042 100644
--- a/docs/domains/blas/gemm_ext.rst
+++ b/docs/domains/blas/gemm_ext.rst
@@ -1,326 +1,316 @@
-.. _gemm_ext:
+.. _onemkl_blas_gemm_ext:
 
 gemm_ext
 ========
 
-
 .. container::
 
 
    Computes a matrix-matrix product with general matrices.
 
 
-   .. container:: section
-      :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      **Standard API**
-
-
-      .. container:: dlsyntaxpara
-
-
-         .. cpp:function::  void gemm_ext(queue &exec_queue, transpose         transa, transpose transb, std::int64_t m, std::int64_t n,         std::int64_t k, Ts alpha, buffer<Ta,1> &a, std::int64_t lda,         buffer<Tb,1> &b, std::int64_t ldb, Ts beta, buffer<Tc,1> &c,         std::int64_t ldc)
-
-         ``gemm_ext`` supports the following precisions and devices.
-
-
-         .. list-table:: 
-            :header-rows: 1
-
-            * -  Ts 
-              -  Ta 
-              -  Tb 
-              -  Tc 
-            * -  ``float`` 
-              -  ``half`` 
-              -  ``half`` 
-              -  ``float`` 
-            * -  ``half`` 
-              -  ``half`` 
-              -  ``half`` 
-              -  ``half`` 
-            * -  ``float`` 
-              -  ``float`` 
-              -  ``float`` 
-              -  ``float`` 
-            * -  ``double`` 
-              -  ``double`` 
-              -  ``double`` 
-              -  ``double`` 
-            * -  ``std::complex<float>`` 
-              -  ``std::complex<float>`` 
-              -  ``std::complex<float>`` 
-              -  ``std::complex<float>`` 
-            * -  ``std::complex<double>`` 
-              -  ``std::complex<double>`` 
-              -  ``std::complex<double>`` 
-              -  ``std::complex<double>`` 
-
+     **Standard API**
 
+      
+     ``gemm_ext`` supports the following precisions.
 
 
-      **Offset API**
+     .. list-table:: 
+        :header-rows: 1
 
+        * -  Ts 
+          -  Ta 
+          -  Tb 
+          -  Tc 
+        * -  ``float`` 
+          -  ``half`` 
+          -  ``half`` 
+          -  ``float`` 
+        * -  ``half`` 
+          -  ``half`` 
+          -  ``half`` 
+          -  ``half`` 
+        * -  ``float`` 
+          -  ``float`` 
+          -  ``float`` 
+          -  ``float`` 
+        * -  ``double`` 
+          -  ``double`` 
+          -  ``double`` 
+          -  ``double`` 
+        * -  ``std::complex<float>`` 
+          -  ``std::complex<float>`` 
+          -  ``std::complex<float>`` 
+          -  ``std::complex<float>`` 
+        * -  ``std::complex<double>`` 
+          -  ``std::complex<double>`` 
+          -  ``std::complex<double>`` 
+          -  ``std::complex<double>`` 
 
-      .. container:: dlsyntaxpara
 
+     **Offset API**
 
-         .. cpp:function::  void gemm_ext(queue &exec_queue, transpose         transa, transpose transb, offset offset_type, std::int64_t m,         std::int64_t n, std::int64_t k, Ts alpha, buffer<Ta,1> &a,         std::int64_t lda, Ta ao, buffer<Tb,1> &b, std::int64_t ldb, Tb         bo, Ts beta, buffer<Tc,1> &c, std::int64_t ldc, buffer<Tc,1>         &co)
 
-         ``gemm_ext`` supports the following precisions.
+     ``gemm_ext`` supports the following precisions.
 
 
-         .. list-table:: 
-            :header-rows: 1
+     .. list-table:: 
+        :header-rows: 1
 
-            * -  Ts 
-              -  Ta 
-              -  Tb 
-              -  Tc 
-            * -  ``float`` 
-              -  ``int8_t`` 
-              -  ``uint8_t`` 
-              -  ``int32_t`` 
+        * -  Ts 
+          -  Ta 
+          -  Tb 
+          -  Tc 
+        * -  ``float`` 
+          -  ``int8_t`` 
+          -  ``uint8_t`` 
+          -  ``int32_t`` 
 
 
+.. container:: section
 
 
-   .. container:: section
-      :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42
+   .. rubric:: Description
+      :class: sectiontitle
 
 
-      .. rubric:: Description
-         :name: description
-         :class: sectiontitle
+   The gemm_ext routines compute a scalar-matrix-matrix product and
+   add the result to a scalar-matrix product, with general matrices.
+   
+   For Standard API, the operation is defined as:
 
+   ::
 
-      The gemm_ext routines compute a scalar-matrix-matrix product and
-      add the result to a scalar-matrix product, with general matrices.
-      The operation is defined as:
 
+      C ← alpha*op(A)*op(B) + beta*C 
 
-      ::
 
+   For Offset API, the operation is defined as: 
 
-         C ← alpha*op(A)*op(B) + beta*C 
+   ::
 
 
-      for the standard API and
-      ::
+      C ← alpha*(op(A) - A_offset)*(op(B) - B_offset) + beta*C + C_offset
 
+   where:
 
-         C ← alpha*(op(A) - A_offset)*(op(B) - B_offset) + beta*C + C_offset
 
+   op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`
 
-      for the offset API
-      where:
 
+   ``alpha`` and ``beta`` are scalars
 
-      -  op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) =
-         X\ :sup:`H`
 
+   ``A_offset`` is an ``m``-by-``k`` matrix with every element equal to the value ao
 
-      -  ``alpha`` and ``beta`` are scalars
 
+   ``B_offset`` is a ``k``-by-``n`` matrix with every element equal to the value bo
 
-      -  ``A_offset`` is an ``m``-by-``k`` matrix with every element
-         equal to the value ao
 
+   ``C_offset`` is an ``m``-by-``n`` matrix defined by the 
+   co buffer as described below. 
 
-      -  ``B_offset`` is a ``k``-by-``n`` matrix with every element
-         equal to the value bo
 
+   ``A``, ``B``, and ``C`` are matrices
 
-      -  ``C_offset`` is an ``m``-by-``n`` matrix defined by the co
-         buffer as described in
-         :ref:`onemkl_datatypes`
 
+   op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and
+   ``C`` is ``m`` x ``n``.
 
-      -  ``A``, ``B``, and ``C`` are matrices
 
+gemm_ext (Buffer Version)
+-------------------------
 
-      Here, op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and
-      ``C`` is ``m`` x ``n``.
-
+.. container::
 
    .. container:: section
-      :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A
-
 
+      .. rubric:: Syntax
+        :class: sectiontitle
+      
+      
+      **Standard API**
+      
+      
+      .. container:: dlsyntaxpara
+      
+      
+        .. cpp:function::  void onemkl::blas::gemm_ext(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, sycl::buffer<Ta,1> &a, std::int64_t lda, sycl::buffer<Tb,1> &b, std::int64_t ldb, Ts beta, sycl::buffer<Tc,1> &c, std::int64_t ldc)
+      
+      
+      **Offset API**
+      
+      
+      .. container:: dlsyntaxpara
+      
+      
+        .. cpp:function::  void onemkl::blas::gemm_ext(sycl::queue &queue, transpose transa, transpose transb, offset offset_type, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, sycl::buffer<Ta,1> &a, std::int64_t lda, Ta ao, sycl::buffer<Tb,1> &b, std::int64_t ldb, Tb bo, Ts beta, sycl::buffer<Tc,1> &c, std::int64_t ldc, sycl::buffer<Tc,1> &co)
+      
+      
+   .. container:: section
+      
+      
       .. rubric:: Input Parameters
-         :name: input-parameters
          :class: sectiontitle
-
-
-      exec_queue
+    
+    
+      queue
          The queue where the routine should be executed.
-
-
+    
+    
       transa
          Specifies op(``A``), the transposition operation applied to
          ``A``. See
          :ref:`onemkl_datatypes` for
          more details.
-
-
+    
+    
+    
       transb
          Specifies op(``B``), the transposition operation applied to
          ``B``. See
          :ref:`onemkl_datatypes` for
          more details.
-
-
+    
+    
+    
       offset_type (offset API only)
          Specifies the form of ``C_offset`` used in the matrix
          multiplication. See
          :ref:`onemkl_datatypes` for
          more details.
-
-
+    
+    
       m
          Number of rows of op(``A``) and ``C``. Must be at least zero.
-
-
+    
+    
       n
          Number of columns of op(``B``) and ``C``. Must be at least
          zero.
-
-
+    
+    
       k
          Number of columns of op(``A``) and rows of op(``B``). Must be
          at least zero.
-
-
+    
+    
       alpha
          Scaling factor for the matrix-matrix product.
-
-
+    
+    
       a
          Buffer holding the input matrix ``A``.
-
-
+    
+    
          If ``A`` is not transposed, ``A`` is an ``m``-by-``k`` matrix
          so the array ``a`` must have size at least ``lda``\ \*\ ``k``.
-
-
+    
+    
          If ``A`` is transposed, ``A`` is a ``k``-by-``m`` matrix so the
          array ``a`` must have size at least ``lda``\ \*\ ``m``.
-
-
+    
+    
          See `Matrix
          Storage <../matrix-storage.html>`__ for
          more details.
-
-
+    
+    
       lda
          Leading dimension of ``A``. Must be at least ``m`` if ``A`` is
          not transposed, and at least ``k`` if ``A`` is transposed. Must
          be positive.
-
-
+    
+    
       ao (offset API only)
          Specifies the scalar offset value for matrix ``A``.
-
-
+    
+    
       b
          Buffer holding the input matrix ``B``.
-
-
+    
+    
          If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so
          the array ``b`` must have size at least ``ldb``\ \*\ ``n``.
-
-
+    
+    
          If ``B`` is transposed, ``B`` is an ``n``-by-``k`` matrix so
          the array ``b`` must have size at least ``ldb``\ \*\ ``k``.
-
-
+    
+    
          See `Matrix
          Storage <../matrix-storage.html>`__ for
          more details.
-
-
+    
+    
       ldb
          Leading dimension of ``B``. Must be at least ``k`` if ``B`` is
          not transposed, and at least ``n`` if ``B`` is transposed. Must
          be positive.
-
-
+    
+    
       bo (offset API only)
          Specifies the scalar offset value for matrix ``B``.
-
-
+    
+    
       beta
          Scaling factor for matrix ``C``.
-
-
+    
+    
       c
-         Buffer holding the input matrix ``C``. Must have size at least
+         Buffer holding the input/output matrix ``C``. Must have size at least
          ``ldc`` \* ``n``. See `Matrix
          Storage <../matrix-storage.html>`__ for
          more details.
-
-
+    
+    
       ldc
          Leading dimension of ``C``. Must be positive and at least
          ``m``.
-
-
+    
+    
       co (offset API only)
          Buffer holding the offset values for matrix ``C``.
-
-
+    
+    
          If ``offset_type = offset::fix``, the ``co`` array must have
          size at least 1.
-
-
+    
+    
          If ``offset_type = offset::col``, the ``co`` array must have
          size at least ``max(1,m)``.
-
-
+    
+    
          If ``offset_type = offset::row``, the ``co`` array must have
-         size at least ``max(1,n)``.
-
-
-         See
-         :ref:`onemkl_datatypes` for
-         more details.
-
-
+         size at least ``max(1,n)``. 
+    
+    
    .. container:: section
-      :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700
-
-
+    
+    
       .. rubric:: Output Parameters
-         :name: output-parameters
          :class: sectiontitle
-
-
+    
+    
       c
          Output buffer, overwritten by alpha\*op(``A``)*op(``B``) +
          beta\*\ ``C`` for the standard API and alpha\*(op(``A``) -
          ``A_offset``)*(op(``B``) - ``B_offset``) + beta\*\ ``C`` +
          ``C_offset`` for the offset API.
-
-
+    
+    
    .. container:: section
-      :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF
-
-
+    
+    
       .. rubric:: Notes
-         :name: notes
          :class: sectiontitle
-
-
+    
+    
       If ``beta`` = 0, matrix ``C`` does not need to be initialized
       before calling gemm_ext.
 
 
+
 .. container:: familylinks
 
 
@@ -331,5 +321,3 @@ gemm_ext
       
 
 
-.. container::
-
diff --git a/docs/domains/blas/gemmt.rst b/docs/domains/blas/gemmt.rst
index e2ceb077d..7954218fd 100644
--- a/docs/domains/blas/gemmt.rst
+++ b/docs/domains/blas/gemmt.rst
@@ -1,4 +1,4 @@
-.. _gemmt:
+.. _onemkl_blas_gemmt:
 
 gemmt
 =====
@@ -11,19 +11,6 @@ gemmt
    only the upper or lower triangular part of the result matrix.
 
 
-   .. container:: section
-      :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. container:: dlsyntaxpara
-
-
-         .. cpp:function::  void gemmt(queue &exec_queue, uplo         upper_lower, transpose transa, transpose transb, std::int64_t         n, std::int64_t k, T alpha, buffer<T,1> &a, std::int64_t lda,         buffer<T,1> &b, std::int64_t ldb, T beta, buffer<T,1> &c,         std::int64_t ldc)
 
          ``gemmt`` supports the following precisions.
 
@@ -41,11 +28,9 @@ gemmt
 
 
    .. container:: section
-      :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42
 
 
       .. rubric:: Description
-         :name: description
          :class: sectiontitle
 
 
@@ -63,30 +48,180 @@ gemmt
       where:
 
 
-      -  op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) =
-         X\ :sup:`H`
+      op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`
 
 
-      -  ``alpha`` and ``beta`` are scalars
+      ``alpha`` and ``beta`` are scalars
 
 
-      -  ``A``, ``B``, and ``C`` are matrices
+      ``A``, ``B``, and ``C`` are matrices
 
 
-      Here, op(``A``) is ``n`` x ``k``, op(``B``) is ``k`` x ``n``, and
+      op(``A``) is ``n`` x ``k``, op(``B``) is ``k`` x ``n``, and
       ``C`` is ``n`` x ``n``.
 
 
+gemmt (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  void onemkl::blas::gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``C``\ ’s data is stored in its upper or
+         lower triangle. See :ref:`onemkl_datatypes` for more details.
+
+      
+      transa
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
+
+      transb
+         Specifies op(``B``), the transposition operation applied to
+         ``B``. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows of op(``A``), columns of op(``B``), and
+         columns of ``C``. Must be at least zero.
+
+
+      k
+         Number of columns of op(``A``) and rows of op(``B``). Must be
+         at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-matrix product.
+
+
+      a
+         Buffer holding the input matrix ``A``.
+
+
+         If ``A`` is not transposed, ``A`` is an ``n``-by-``k`` matrix
+         so the array ``a`` must have size at least ``lda``\ \*\ ``k``.
+
+
+         If ``A`` is transposed, ``A`` is a ``k``-by-``n`` matrix so the
+         array ``a`` must have size at least ``lda``\ \*\ ``n``.
+
+
+         See `Matrix Storage <../matrix-storage.html>`__ for more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``n`` if ``A`` is
+         not transposed, and at least ``k`` if ``A`` is transposed. Must
+         be positive.
+
+
+      b
+         Buffer holding the input matrix ``B``.
+
+
+         If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so
+         the array ``b`` must have size at least ``ldb``\ \*\ ``n``.
+
+
+         If ``B`` is transposed, ``B`` is an ``n``-by-``k`` matrix so
+         the array ``b`` must have size at least ``ldb``\ \*\ ``k``.
+
+
+         See `Matrix Storage <../matrix-storage.html>`__ for more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be at least ``k`` if ``B`` is
+         not transposed, and at least ``n`` if ``B`` is transposed. Must
+         be positive.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         Buffer holding the input/output matrix ``C``. Must have size at
+         least ``ldc`` \* ``n``. See `Matrix
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``n``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Output buffer, overwritten by the upper or lower triangular
+         part of alpha\*op(``A``)*op(``B``) + beta\*\ ``C``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``beta`` = 0, matrix ``C`` does not need to be initialized
+      before calling gemmt.
+
+
+gemmt (USM Version)
+-------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
    .. container:: section
-      :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A
 
 
       .. rubric:: Input Parameters
-         :name: input-parameters
          :class: sectiontitle
 
 
-      exec_queue
+      queue
          The queue where the routine should be executed.
 
 
@@ -97,6 +232,7 @@ gemmt
          more details.
 
 
+
       transa
          Specifies op(``A``), the transposition operation applied to
          ``A``. See
@@ -104,12 +240,14 @@ gemmt
          more details.
 
 
+
       transb
          Specifies op(``B``), the transposition operation applied to
          ``B``. See
          :ref:`onemkl_datatypes` for
          more details.
 
+ 
 
       n
          Number of columns of op(``A``), columns of op(``B``), and
@@ -126,7 +264,7 @@ gemmt
 
 
       a
-         Buffer holding the input matrix ``A``.
+         Pointer to input matrix ``A``.
 
 
          If ``A`` is not transposed, ``A`` is an ``n``-by-``k`` matrix
@@ -149,7 +287,7 @@ gemmt
 
 
       b
-         Buffer holding the input matrix ``B``.
+         Pointer to input matrix ``B``.
 
 
          If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so
@@ -176,8 +314,8 @@ gemmt
 
 
       c
-         Buffer holding the input/output matrix ``C``. Must have size at
-         least ``ldc`` \* ``n``. See `Matrix
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc`` \* ``n``. See `Matrix
          Storage <../matrix-storage.html>`__ for
          more details.
 
@@ -187,26 +325,27 @@ gemmt
          ``m``.
 
 
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
    .. container:: section
-      :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700
 
 
       .. rubric:: Output Parameters
-         :name: output-parameters
          :class: sectiontitle
 
 
       c
-         Output buffer, overwritten by the upper or lower triangular
-         part ofalpha\*op(``A``)*op(``B``) + beta\*\ ``C``.
+         Pointer to the output matrix, overwritten by the upper or lower
+         triangular part of alpha\*op(``A``)*op(``B``) + beta\*\ ``C``.
 
 
    .. container:: section
-      :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF
 
 
       .. rubric:: Notes
-         :name: notes
          :class: sectiontitle
 
 
@@ -214,15 +353,20 @@ gemmt
       before calling gemmt.
 
 
-.. container:: familylinks
+   .. container:: section
 
 
-   .. container:: parentlink
+      .. rubric:: Return Values
+         :class: sectiontitle
 
 
-      **Parent topic:** :ref:`blas-like-extensions`
-      
+      Output event to wait on to ensure computation is complete.
 
 
-.. container::
+.. container:: familylinks
 
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/gemv.rst b/docs/domains/blas/gemv.rst
index 1345bcdf0..48a1b439c 100644
--- a/docs/domains/blas/gemv.rst
+++ b/docs/domains/blas/gemv.rst
@@ -1,4 +1,4 @@
-.. _gemv:
+.. _onemkl_blas_gemv:
 
 gemv
 ====
@@ -10,18 +10,8 @@ gemv
    Computes a matrix-vector product using a general matrix.
 
 
-   .. container:: section
-      :name: GUID-EA8D6705-E7C2-42E2-BE80-D9AD83645FCC
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void gemv(queue &exec_queue, transpose trans,      std::int64_t m, std::int64_t n, T alpha, buffer<T,1> &a,      std::int64_t lda, buffer<T,1> &x, std::int64_t incx, T beta,      buffer<T,1> &y, std::int64_t incy)
-
-      gemv supports the following precisions.
+      ``gemv`` supports the following precisions.
 
 
       .. list-table:: 
@@ -37,15 +27,13 @@ gemv
 
 
 .. container:: section
-   :name: GUID-AE220EED-6066-4881-8B3C-35207BAB0105
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The gemv routines compute a scalar-matrix-vector product and add the
+   The ``gemv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a general matrix. The
    operation is defined as
 
@@ -69,24 +57,32 @@ gemv
    ``A`` is an ``m``-by-``n`` matrix, and ``x``, ``y`` are vectors.
 
 
+gemv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-F3E8F201-6033-45A1-A326-CA4CFB631C3A
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    trans
       Specifies ``op(A)``, the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
 
 
    m
@@ -105,7 +101,7 @@ gemv
 
    a
       The buffer holding the input matrix ``A``. Must have a size of at
-      least ``lda``\ \*n. See `Matrix and Vector
+      least ``lda``\ \*\ ``n``. See `Matrix and Vector
       Storage <../matrix-storage.html>`__ for
       more details.
 
@@ -147,11 +143,9 @@ gemv
 
 
 .. container:: section
-   :name: GUID-1533BCA6-E652-4A08-A82D-162F3CEBDD29
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -159,16 +153,130 @@ gemv
       The buffer holding updated vector ``y``.
 
 
+gemv (USM Version)
+------------------
 
-.. container:: familylinks
+.. container::
 
+   .. container:: section
 
-   .. container:: parentlink
 
+      .. rubric:: Syntax
+         :class: sectiontitle
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
 
+      .. container:: dlsyntaxpara
 
-.. container::
 
+         .. cpp:function::  sycl::event onemkl::blas::gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      trans
+         Specifies ``op(A)``, the transposition operation applied to
+         ``A``. See
+         :ref:`onemkl_datatypes` for
+         more details.
+
+
+
+      m
+         Specifies the number of rows of the matrix ``A``. The value of
+         ``m`` must be at least zero.
+
+
+      n
+         Specifies the number of columns of the matrix ``A``. The value
+         of ``n`` must be at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         The pointer to the input matrix ``A``. Must have a size of at
+         least ``lda``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         The leading dimension of matrix ``A``. It must be at least
+         ``m``, and positive.
+
+
+      x
+         Pointer to the input vector ``x``. The length ``len`` of vector
+         ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A``
+         is transposed. The array holding vector ``x`` must be of size
+         at least (1 + (``len`` - 1)*abs(``incx``)). See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         The stride of vector ``x``.
+
+
+      beta
+         The scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The length ``len`` of
+         vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
+         ``A`` is transposed. The array holding input/output vector
+         ``y`` must be of size at least (1 + (``len`` -
+         1)*abs(``incy``)) where ``len`` is this length. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         The stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         The pointer to updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/ger.rst b/docs/domains/blas/ger.rst
index 1a122ce84..1877a5e9d 100644
--- a/docs/domains/blas/ger.rst
+++ b/docs/domains/blas/ger.rst
@@ -1,4 +1,4 @@
-.. _ger:
+.. _onemkl_blas_ger:
 
 ger
 ===
@@ -10,16 +10,6 @@ ger
    Computes a rank-1 update of a general matrix.
 
 
-   .. container:: section
-      :name: GUID-0DA23698-EB19-4AAF-A5FD-9BB530A9EFE0
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void ger(queue &exec_queue, std::int64_t m,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a, std::int64_t      lda)
 
       ``ger`` supports the following precisions.
 
@@ -35,21 +25,16 @@ ger
 
 
 .. container:: section
-   :name: GUID-72E035B0-E1C2-442B-AE9D-2CB873E90FAF
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The ger routines compute a scalar-vector-vector product and add the
+   The ``ger`` routines compute a scalar-vector-vector product and add the
    result to a general matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*y :sup:`T` + A
 
 
@@ -68,16 +53,27 @@ ger
    ``y`` is a vector length ``n``.
 
 
+ger (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function:: void onemkl::blas::ger(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a, std::int64_t lda)
 .. container:: section
-   :name: GUID-6953A2E5-0065-425C-986B-15966C793067
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -128,11 +124,9 @@ ger
 
 
 .. container:: section
-   :name: GUID-E2A13688-1D12-4DD0-9752-3557E980ACC0
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -140,15 +134,111 @@ ger
       Buffer holding the updated matrix ``A``.
 
 
-.. container:: familylinks
+ger (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::ger(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      m
+         Number of rows of ``A``. Must be at least zero.
+
+
+      n
+         Number of columns of ``A``. Must be at least zero.
+
 
+      alpha
+         Scaling factor for the vector-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``m`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. Must have size at least
+         ``lda``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``m``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated matrix ``A``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/gerc.rst b/docs/domains/blas/gerc.rst
index 5a9c772ac..577da3c89 100644
--- a/docs/domains/blas/gerc.rst
+++ b/docs/domains/blas/gerc.rst
@@ -1,4 +1,4 @@
-.. _gerc:
+.. _onemkl_blas_gerc:
 
 gerc
 ====
@@ -10,16 +10,6 @@ gerc
    Computes a rank-1 update (conjugated) of a general complex matrix.
 
 
-   .. container:: section
-      :name: GUID-5A1B0292-28F6-45EB-95C4-FDA03D8D5062
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void gerc(queue &exec_queue, std::int64_t m,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a, std::int64_t      lda)
 
       ``gerc`` supports the following precisions.
 
@@ -35,15 +25,13 @@ gerc
 
 
 .. container:: section
-   :name: GUID-6CB627E5-A9C7-488D-8366-E7944A5C889E
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The gerc routines compute a scalar-vector-vector product and add the
+   The ``gerc`` routines compute a scalar-vector-vector product and add the
    result to a general matrix. The operation is defined as
 
 
@@ -68,16 +56,27 @@ gerc
    ``y`` is vector of length ``n``.
 
 
+gerc (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function:: void onemkl::blas::gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a, std::int64_t lda)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -128,27 +127,122 @@ gerc
 
 
 .. container:: section
-   :name: GUID-48944ED2-C10F-4B64-A91A-C9050AD24A92
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated matrix *A*.
+      Buffer holding the updated matrix ``A``.
 
 
-.. container:: familylinks
+gerc (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      m
+         Number of rows of ``A``. Must be at least zero.
+
+
+      n
+         Number of columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the vector-vector product.
 
+
+      x
+         Pointer to the input vector ``x``. The array holding input
+         vector ``x`` must be of size at least (1 + (``m`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to the input vector ``y``. The array holding the input
+         vector ``y`` must be of size at least (1 + (``n`` -
+         1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``m``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated matrix ``A``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/geru.rst b/docs/domains/blas/geru.rst
index 121e2d13c..2bce5cfce 100644
--- a/docs/domains/blas/geru.rst
+++ b/docs/domains/blas/geru.rst
@@ -1,4 +1,4 @@
-.. _geru:
+.. _onemkl_blas_geru:
 
 geru
 ====
@@ -10,16 +10,6 @@ geru
    Computes a rank-1 update (unconjugated) of a general complex matrix.
 
 
-   .. container:: section
-      :name: GUID-5942D28E-EDD6-4759-B19E-FBB51F35125B
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void geru(queue &exec_queue, std::int64_t m,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a, std::int64_t      lda)
 
       ``geru`` supports the following precisions.
 
@@ -35,21 +25,16 @@ geru
 
 
 .. container:: section
-   :name: GUID-75ECE219-BA77-48E8-B13B-FB504DD60CD4
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The geru routines routines compute a scalar-vector-vector product and
+   The ``geru`` routines compute a scalar-vector-vector product and
    add the result to a general matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*y :sup:`T` + A
 
 
@@ -68,16 +53,27 @@ geru
    ``y`` is a vector of length ``n``.
 
 
+geru (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::geru(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a, std::int64_t lda)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -128,11 +124,9 @@ geru
 
 
 .. container:: section
-   :name: GUID-6E9315E9-DDCF-485D-8BDF-AB4BF8448BE1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -140,15 +134,112 @@ geru
       Buffer holding the updated matrix ``A``.
 
 
-.. container:: familylinks
+geru (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::geru(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      m
+         Number of rows of ``A``. Must be at least zero.
+
+
+      n
+         Number of columns of ``A``. Must be at least zero.
+
 
+      alpha
+         Scaling factor for the vector-vector product.
+
+
+      x
+         Pointer to the input vector ``x``. The array holding input
+         vector ``x`` must be of size at least (1 + (``m`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``m``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated matrix ``A``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hbmv.rst b/docs/domains/blas/hbmv.rst
index e23481a2a..48b60d504 100644
--- a/docs/domains/blas/hbmv.rst
+++ b/docs/domains/blas/hbmv.rst
@@ -1,4 +1,4 @@
-.. _hbmv:
+.. _onemkl_blas_hbmv:
 
 hbmv
 ====
@@ -10,17 +10,6 @@ hbmv
    Computes a matrix-vector product using a Hermitian band matrix.
 
 
-   .. container:: section
-      :name: GUID-F5FF420B-922B-4552-8F55-6EBCA7177881
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hbmv(queue &exec_queue, uplo upper_lower,      std::int64_t n, std::int64_t k, T alpha, buffer<T,1> &a,      std::int64_t lda, buffer<T,1> &x, std::int64_t incx, T beta,      buffer<T,1> &y, std::int64_t incy)
-
       ``hbmv`` supports the following precisions.
 
 
@@ -35,22 +24,17 @@ hbmv
 
 
 .. container:: section
-   :name: GUID-8AB4BAC9-8124-4B52-8C15-1BC673820EB9
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hbmv routines compute a scalar-matrix-vector product and add the
+   The ``hbmv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a Hermitian band matrix. The
    operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -67,23 +51,33 @@ hbmv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+hbmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
 
    n
@@ -138,11 +132,9 @@ hbmv
 
 
 .. container:: section
-   :name: GUID-7261182A-450B-46F5-8C61-7133597D3530
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -150,15 +142,122 @@ hbmv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+hbmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
 
+      k
+         Number of super-diagonals of the matrix ``A``. Must be at least
+         zero.
+
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to the input matrix ``A``. The array holding input
+         matrix ``A`` must have size at least ``lda``\ \*\ ``n``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least (``k`` +
+         1), and positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hemm.rst b/docs/domains/blas/hemm.rst
index bc597e228..6b9729998 100644
--- a/docs/domains/blas/hemm.rst
+++ b/docs/domains/blas/hemm.rst
@@ -1,4 +1,4 @@
-.. _hemm:
+.. _onemkl_blas_hemm:
 
 hemm
 ====
@@ -11,18 +11,8 @@ hemm
    and one is general.
 
 
-   .. container:: section
-      :name: GUID-F06C86BA-4F57-4608-B0D7-F7B920F867D7
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hemm(queue &exec_queue, side left_right,      uplo upper_lower, std::int64_t m, std::int64_t n, T alpha,      buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b, std::int64_t      ldb, T beta, buffer<T,1> &c, std::int64_t ldc)
 
-      hemm supports the following precisions:
+      ``hemm`` supports the following precisions:
 
 
       .. list-table:: 
@@ -36,15 +26,13 @@ hemm
 
 
 .. container:: section
-   :name: GUID-835E7F58-406E-444F-9DFD-121B84C22284
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hemm routines compute a scalar-matrix-matrix product and add the
+   The ``hemm`` routines compute a scalar-matrix-matrix product and add the
    result to a scalar-matrix product, where one of the matrices in the
    multiplication is Hermitian. The argument ``left_right`` determines
    if the Hermitian matrix, ``A``, is on the left of the multiplication
@@ -53,18 +41,11 @@ hemm
    defined as
 
 
-  
-
-
       C <- alpha*A*B + beta*C
 
-
    or
 
 
-  
-
-
       C <- alpha*B*A + beta*C
 
 
@@ -81,31 +62,40 @@ hemm
    ``B`` and ``C`` are ``m``-by-``n`` matrices.
 
 
+hemm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
 .. container:: section
-   :name: GUID-922C5F92-38B2-457B-B6C7-3CDD0531F97D
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    left_right
       Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
+
 
 
    uplo
       Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangle. See :ref:`onemkl_datatypes` for more details.
+
 
 
    m
@@ -167,19 +157,11 @@ hemm
 
 
 .. container:: section
-   :name: GUID-94385C78-968D-4C03-AA5C-7379D5607800
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
-
-   
-       
-
-
-
    c
       Output buffer, overwritten by ``alpha``\ \*\ ``A``\ \*\ ``B`` +
       ``beta``\ \*\ ``C`` (``left_right`` = ``side::left``) or
@@ -188,11 +170,9 @@ hemm
 
 
 .. container:: section
-   :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
@@ -200,15 +180,152 @@ hemm
    calling ``hemm``.
 
 
-.. container:: familylinks
+hemm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      left_right
+         Specifies whether ``A`` is on the left side of the
+         multiplication (``side::left``) or on the right side
+         (``side::right``). See :ref:`onemkl_datatypes` for more details.
+
+
+
+      upper_lower
+         Specifies whether ``A``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      m
+         Specifies the number of rows of the matrix ``B`` and ``C``.
+
+
+         The value of ``m`` must be at least zero.
 
+
+      n
+         Specifies the number of columns of the matrix ``B`` and ``C``.
+
+
+         The value of ``n`` must be at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-matrix product.
+
+
+      a
+         Pointer to input matrix ``A``. Must have size at least
+         ``lda``\ \*\ ``m`` if ``A`` is on the left of the
+         multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right.
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``m`` if ``A`` is
+         on the left of the multiplication, or at least ``n`` if ``A``
+         is on the right. Must be positive.
+
+
+      b
+         Pointer to input matrix ``B``. Must have size at least
+         ``ldb``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be positive and at least
+         ``m``.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``m``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by
+         ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C``
+         (``left_right`` = ``side::left``) or
+         ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
+         (``left_right`` = ``side::right``).
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``beta`` = 0, matrix ``C`` does not need to be initialized
+      before calling ``hemm``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/hemv.rst b/docs/domains/blas/hemv.rst
index 289cdc0a8..b701d91af 100644
--- a/docs/domains/blas/hemv.rst
+++ b/docs/domains/blas/hemv.rst
@@ -1,4 +1,4 @@
-.. _hemv:
+.. _onemkl_blas_hemv:
 
 hemv
 ====
@@ -10,16 +10,6 @@ hemv
    Computes a matrix-vector product using a Hermitian matrix.
 
 
-   .. container:: section
-      :name: GUID-152B72DC-F67F-4D7D-96DA-67AE6AD41718
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hemv(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &a, std::int64_t lda,      buffer<T,1> &x, std::int64_t incx, T beta, buffer<T,1> &y,      std::int64_t incy)
 
       ``hemv`` supports the following precisions.
 
@@ -35,22 +25,17 @@ hemv
 
 
 .. container:: section
-   :name: GUID-0E4AE01A-4FE8-42AC-B236-409F4DD48F88
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hemv routines compute a scalar-matrix-vector product and add the
+   The ``hemv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a Hermitian matrix. The
    operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -66,23 +51,33 @@ hemv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+hemv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether *A* is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
 
    n
@@ -132,11 +127,9 @@ hemv
 
 
 .. container:: section
-   :name: GUID-66566E59-9A52-4207-B123-AF45FA3A0FBC
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -144,15 +137,117 @@ hemv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+hemv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
 
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her.rst b/docs/domains/blas/her.rst
index a13196fb6..5fc0f438d 100644
--- a/docs/domains/blas/her.rst
+++ b/docs/domains/blas/her.rst
@@ -1,4 +1,4 @@
-.. _her:
+.. _onemkl_blas_her:
 
 her
 ===
@@ -10,16 +10,6 @@ her
    Computes a rank-1 update of a Hermitian matrix.
 
 
-   .. container:: section
-      :name: GUID-252B1D4A-30C7-4678-9793-6A0C90DEB04A
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void her(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &a, std::int64_t lda)
 
       ``her`` supports the following precisions.
 
@@ -35,21 +25,16 @@ her
 
 
 .. container:: section
-   :name: GUID-A06B7C00-CFD6-4A01-9739-19093823B58E
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The her routines compute a scalar-vector-vector product and add the
+   The ``her`` routines compute a scalar-vector-vector product and add the
    result to a Hermitian matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*x :sup:`H` + A
 
 
@@ -65,23 +50,33 @@ her
    ``x`` is a vector of length ``n``.
 
 
+her (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function:: void onemkl::blas::her(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &a, std::int64_t lda)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether *A* is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
 
    n
@@ -116,33 +111,123 @@ her
 
 
 .. container:: section
-   :name: GUID-89A60481-0763-4608-B346-3CC746467F28
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated upper triangular part of theHermitian
+      Buffer holding the updated upper triangular part of the Hermitian
       matrix ``A`` if ``upper_lower = upper`` or the updated
-      lowertriangular part of the Hermitian matrix ``A`` if
+      lower triangular part of the Hermitian matrix ``A`` if
       ``upper_lower = lower``.
 
 
-      The imaginary parts of the diagonal elementsare set to zero.
+      The imaginary parts of the diagonal elements are set to zero.
 
 
-.. container:: familylinks
+her (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::her(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
-.. container::
 
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the vector-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the Hermitian
+         matrix ``A`` if ``upper_lower = upper`` or the updated
+         lower triangular part of the Hermitian matrix ``A`` if
+         ``upper_lower = lower``.
+
+
+         The imaginary parts of the diagonal elements are set to zero.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her2.rst b/docs/domains/blas/her2.rst
index 0c100195c..ae8762e87 100644
--- a/docs/domains/blas/her2.rst
+++ b/docs/domains/blas/her2.rst
@@ -1,4 +1,4 @@
-.. _her2:
+.. _onemkl_blas_her2:
 
 her2
 ====
@@ -10,16 +10,6 @@ her2
    Computes a rank-2 update of a Hermitian matrix.
 
 
-   .. container:: section
-      :name: GUID-4BED3537-E900-4260-A6EB-2F42CB1D3AFB
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void her2(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a, std::int64_t      lda)
 
       ``her2`` supports the following precisions.
 
@@ -35,21 +25,16 @@ her2
 
 
 .. container:: section
-   :name: GUID-2B939041-9BCC-4AE8-A31D-2CFCA67B9B6A
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The her2 routines compute two scalar-vector-vector products and add
+   The ``her2`` routines compute two scalar-vector-vector products and add
    them to a Hermitian matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*y :sup:`H` + conjg(alpha)*y*x :sup:`H` + A
 
 
@@ -65,23 +50,33 @@ her2
    ``x`` and ``y`` are vectors or length ``n``.
 
 
+her2 (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function:: void onemkl::blas::her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a, std::int64_t lda)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether *A* is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
 
    n
@@ -127,33 +122,135 @@ her2
 
 
 .. container:: section
-   :name: GUID-34B3837B-4980-458B-AC3A-EEE5F635834C
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated upper triangular part of theHermitian
+      Buffer holding the updated upper triangular part of the Hermitian
       matrix ``A`` if ``upper_lower = upper``, or the updated
-      lowertriangular part of the Hermitian matrix ``A`` if
+      lower triangular part of the Hermitian matrix ``A`` if
       ``upper_lower = lower``.
 
 
-      The imaginary parts of the diagonal elementsare set to zero.
+      The imaginary parts of the diagonal elements are set to zero.
 
 
-.. container:: familylinks
+her2 (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
 
+      alpha
+         Scaling factor for the vector-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input
+         vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the Hermitian
+         matrix ``A`` if ``upper_lower = upper``, or the updated
+         lower triangular part of the Hermitian matrix ``A`` if
+         ``upper_lower = lower``.
+
+
+         The imaginary parts of the diagonal elements are set to zero.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her2k.rst b/docs/domains/blas/her2k.rst
index 98e9cac4f..1a510145f 100644
--- a/docs/domains/blas/her2k.rst
+++ b/docs/domains/blas/her2k.rst
@@ -1,4 +1,4 @@
-.. _her2k:
+.. _onemkl_blas_her2k:
 
 her2k
 =====
@@ -10,18 +10,8 @@ her2k
    Performs a Hermitian rank-2k update.
 
 
-   .. container:: section
-      :name: GUID-1839F1B0-EFE0-40A4-901E-53E7F9B395C2
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void her2k(queue &exec_queue, uplo upper_lower,      transpose trans, std::int64_t n, std::int64_t k, T alpha,      buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b, std::int64_t      ldb, T_real beta, buffer<T,1> &c, std::int64_t ldc)
-
-      her2k supports the following precisions:
+      ``her2k`` supports the following precisions:
 
 
       .. list-table:: 
@@ -38,22 +28,17 @@ her2k
 
 
 .. container:: section
-   :name: GUID-6DDD93FE-028E-400C-BBD0-CA13132FAC35
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The her2k routines perform a rank-2k update of an ``n`` x ``n``
+   The ``her2k`` routines perform a rank-2k update of an ``n`` x ``n``
    Hermitian matrix ``C`` by general matrices ``A`` and ``B``. If
    ``trans`` = ``transpose::nontrans``. The operation is defined as
 
 
-  
-
-
       C <- alpha*A*B :sup:`H` + conjg(alpha)*B*A :sup:`H` + beta*C
 
 
@@ -63,9 +48,6 @@ her2k
    If ``trans`` = ``transpose::conjtrans``, the operation is defined as:
 
 
-  
-
-
       C <- alpha*B*A :sup:`H` + conjg(alpha)*A*B :sup:`H` + beta*C
 
 
@@ -84,24 +66,34 @@ her2k
    The inner dimension of both matrix multiplications is ``k``.
 
 
+her2k (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function:: void onemkl::blas::her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T_real beta, sycl::buffer<T,1> &c, std::int64_t ldc)
 .. container:: section
-   :name: GUID-54538396-B04D-4A2A-8A7D-E503A6F815AD
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
       Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangle. See :ref:`onemkl_datatypes` for more details.
+
 
 
    trans
@@ -172,11 +164,9 @@ her2k
 
 
 .. container:: section
-   :name: GUID-48D39D42-B29F-4428-A588-9058570B5D5E
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -184,15 +174,140 @@ her2k
       Output buffer, overwritten by the updated ``C`` matrix.
 
 
-.. container:: familylinks
+her2k (USM Version)
+-------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T_real beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``C``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      trans
+         Specifies the operation to apply, as described above. Supported
+         operations are ``transpose::nontrans`` and
+         ``transpose::conjtrans``.
+
+
+      n
+         The number of rows and columns in ``C``. The value of ``n``
+         must be at least zero.
 
+
+      k
+         The inner dimension of matrix multiplications. The value of
+         ``k`` must be at least equal to zero.
+
+
+      alpha
+         Complex scaling factor for the rank-2\ ``k`` update.
+
+
+      a
+         Pointer to input matrix ``A``. If ``trans`` =
+         ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so
+         the array ``a`` must have size at least ``lda``\ \*\ ``k``.
+         Otherwise, ``A`` is a ``k``-by-``n`` matrix so the array ``a``
+         must have size at least ``lda``\ \*\ ``n``. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``n`` if ``trans``
+         = ``transpose::nontrans``, and at least ``k`` otherwise. Must
+         be positive.
+
+
+      beta
+         Real scaling factor for matrix ``C``.
+
+
+      b
+         Pointer to input matrix ``B``. If ``trans`` =
+         ``transpose::nontrans``, ``B`` is a ``k``-by-``n`` matrix so
+         the array ``b`` must have size at least ``ldb``\ \*\ ``n``.
+         Otherwise, ``B`` is an ``n``-by-``k`` matrix so the array ``b``
+         must have size at least ``ldb``\ \*\ ``k``. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be at least ``k`` if ``trans``
+         = ``transpose::nontrans``, and at least ``n`` otherwise. Must
+         be positive.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``n``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by the updated ``C``
+         matrix.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/herk.rst b/docs/domains/blas/herk.rst
index 03b510a1f..1b1e806c3 100644
--- a/docs/domains/blas/herk.rst
+++ b/docs/domains/blas/herk.rst
@@ -1,4 +1,4 @@
-.. _herk:
+.. _onemkl_blas_herk:
 
 herk
 ====
@@ -10,18 +10,8 @@ herk
    Performs a Hermitian rank-k update.
 
 
-   .. container:: section
-      :name: GUID-407B8203-A28D-468B-BA79-87FA865E75A2
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void herk(queue &exec_queue, uplo upper_lower,      transpose trans, std::int64_t n, std::int64_t k, T_real alpha,      buffer<T,1> &a, std::int64_t lda, T_real beta, buffer<T,1> &c,      std::int64_t ldc)
-
-      herk supports the following precisions:
+      ``herk`` supports the following precisions:
 
 
       .. list-table:: 
@@ -38,19 +28,14 @@ herk
 
 
 .. container:: section
-   :name: GUID-539B4E63-9CDF-4834-999A-4133CE5DE1E5
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The herk routines compute a rank-``k`` update of a Hermitian matrix
-   *C* by a general matrix ``A``. The operation is defined as:
-
-
-  
+   The ``herk`` routines compute a rank-``k`` update of a Hermitian matrix
+   ``C`` by a general matrix ``A``. The operation is defined as:
 
 
       C <- alpha*op(A)*op(A) :sup:`H` + beta*C
@@ -71,29 +56,38 @@ herk
    Here op(``A``) is ``n`` x ``k``, and ``C`` is ``n`` x ``n``.
 
 
+herk (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T_real alpha, sycl::buffer<T,1> &a, std::int64_t lda, T_real beta, sycl::buffer<T,1> &c, std::int64_t ldc)
+
 .. container:: section
-   :name: GUID-7B880A06-4E53-4DE9-B0E6-D70673CF2638
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
       Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangle. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
+      Specifies op(``A``), the transposition operation applied to ``A``. See
       :ref:`onemkl_datatypes` for more
       details. Supported operations are ``transpose::nontrans`` and
       ``transpose::conjtrans``.
@@ -147,11 +141,9 @@ herk
 
 
 .. container:: section
-   :name: GUID-05309970-DEC8-4D87-90AA-958FC101E119
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -161,15 +153,127 @@ herk
       The imaginary parts of the diagonal elements are set to zero.
 
 
-.. container:: familylinks
+herk (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T_real alpha, const T* a, std::int64_t lda, T_real beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``C``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details. Supported operations are ``transpose::nontrans``
+         and ``transpose::conjtrans``.
+
 
+      n
+         The number of rows and columns in ``C``. The value of ``n`` must
+         be at least zero.
+
+
+      k
+         Number of columns in op(``A``).
+
+
+         The value of ``k`` must be at least zero.
+
+
+      alpha
+         Real scaling factor for the rank-``k`` update.
+
+
+      a
+         Pointer to input matrix ``A``. If ``trans`` =
+         ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so
+         the array ``a`` must have size at least ``lda``\ \*\ ``k``.
+         Otherwise, ``A`` is a ``k``-by-``n`` matrix so the array ``a``
+         must have size at least ``lda``\ \*\ ``n``. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``n`` if ``A`` is
+         not transposed, and at least ``k`` if ``A`` is transposed. Must
+         be positive.
+
+
+      beta
+         Real scaling factor for matrix ``C``.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``n``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by
+         ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`H` +
+         ``beta``\ \*\ ``C``. The imaginary parts of the diagonal
+         elements are set to zero.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/hpmv.rst b/docs/domains/blas/hpmv.rst
index f71e4b392..f591a36fe 100644
--- a/docs/domains/blas/hpmv.rst
+++ b/docs/domains/blas/hpmv.rst
@@ -1,4 +1,4 @@
-.. _hpmv:
+.. _onemkl_blas_hpmv:
 
 hpmv
 ====
@@ -10,16 +10,6 @@ hpmv
    Computes a matrix-vector product using a Hermitian packed matrix.
 
 
-   .. container:: section
-      :name: GUID-C6E4A4A7-5CBE-46ED-A021-8FEAABAA2E93
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hpmv(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &a, buffer<T,1> &x,      std::int64_t incx, T beta, buffer<T,1> &y, std::int64_t incy)
 
       ``hpmv`` supports the following precisions.
 
@@ -35,22 +25,17 @@ hpmv
 
 
 .. container:: section
-   :name: GUID-A95C32C5-0371-429B-847C-4EE29FD9C480
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hpmv routines compute a scalar-matrix-vector product and add the
+   The ``hpmv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a Hermitian packed matrix.
    The operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -66,23 +51,34 @@ hpmv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+hpmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &a, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether *A* is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
 
    n
@@ -131,11 +127,9 @@ hpmv
 
 
 .. container:: section
-   :name: GUID-416B82CD-C5B8-472A-8347-04997EA6D6E6
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -143,15 +137,116 @@ hpmv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+hpmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
 
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+         The imaginary parts of the diagonal elements need not be set
+         and are assumed to be zero.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hpr.rst b/docs/domains/blas/hpr.rst
index 1c0f38a0c..f4cb47c0b 100644
--- a/docs/domains/blas/hpr.rst
+++ b/docs/domains/blas/hpr.rst
@@ -1,4 +1,4 @@
-.. _hpr:
+.. _onemkl_blas_hpr:
 
 hpr
 ===
@@ -10,16 +10,6 @@ hpr
    Computes a rank-1 update of a Hermitian packed matrix.
 
 
-   .. container:: section
-      :name: GUID-61DC4DBA-9357-4129-B8A3-931E2E7335D4
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hpr(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &a)
 
       ``hpr`` supports the following precisions.
 
@@ -32,24 +22,17 @@ hpr
          * -  ``std::complex<double>`` 
 
 
-
-
 .. container:: section
-   :name: GUID-02B8128C-02CE-4D5C-BE5D-DFD088C90475
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hpr routines compute a scalar-vector-vector product and add the
+   The ``hpr`` routines compute a scalar-vector-vector product and add the
    result to a Hermitian packed matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*x :sup:`H` + A
 
 
@@ -65,23 +48,33 @@ hpr
    ``x`` is a vector of length ``n``.
 
 
+hpr (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &a)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -111,37 +104,125 @@ hpr
 
 
       The imaginary part of the diagonal elements need not be set and
-      are assumed to be zero
+      are assumed to be zero.
 
 
 .. container:: section
-   :name: GUID-7261182A-450B-46F5-8C61-7133597D3530
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated upper triangularpart of the Hermitian
+      Buffer holding the updated upper triangular part of the Hermitian
       matrix ``A`` if ``upper_lower =upper``, or the updated lower
-      triangular part of theHermitian matrix ``A`` if
+      triangular part of the Hermitian matrix ``A`` if
       ``upper_lower =lower``.
 
 
-      The imaginary parts of the diagonal elements are set tozero.
+      The imaginary parts of the diagonal elements are set to zero.
 
 
-.. container:: familylinks
+hpr (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the scalar-vector-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+         The imaginary parts of the diagonal elements need not be set and
+         are assumed to be zero.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the Hermitian
+         matrix ``A`` if ``upper_lower =upper``, or the updated lower
+         triangular part of the Hermitian matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+         The imaginary parts of the diagonal elements are set to zero.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hpr2.rst b/docs/domains/blas/hpr2.rst
index bfe83d4b6..4537428cb 100644
--- a/docs/domains/blas/hpr2.rst
+++ b/docs/domains/blas/hpr2.rst
@@ -1,4 +1,4 @@
-.. _hpr2:
+.. _onemkl_blas_hpr2:
 
 hpr2
 ====
@@ -10,16 +10,6 @@ hpr2
    Performs a rank-2 update of a Hermitian packed matrix.
 
 
-   .. container:: section
-      :name: GUID-9F8EB534-6520-4470-85AC-6AD8F2467AD4
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void hpr2(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a)
 
       ``hpr2`` supports the following precisions.
 
@@ -35,21 +25,16 @@ hpr2
 
 
 .. container:: section
-   :name: GUID-16FE1EDC-1A72-4BAB-8AFF-C316C4CE5838
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The hpr2 routines compute two scalar-vector-vector products and add
+   The ``hpr2`` routines compute two scalar-vector-vector products and add
    them to a Hermitian packed matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*y :sup:`H` + conjg(alpha)*y*x :sup:`H` + A
 
 
@@ -65,23 +50,33 @@ hpr2
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+hpr2 (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether *A* is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -126,38 +121,133 @@ hpr2
 
 
 .. container:: section
-   :name: GUID-9A77A2E0-F610-44EE-A3EE-81327B90A3FD
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
-   **sycl:**
-       
-
-
-
    a
-      Buffer holding the updated upper triangularpart of the Hermitian
+      Buffer holding the updated upper triangular part of the Hermitian
       matrix ``A`` if ``upper_lower =upper``, or the updated lower
-      triangular part of theHermitian matrix ``A`` if
+      triangular part of the Hermitian matrix ``A`` if
       ``upper_lower =lower``.
 
 
-      The imaginary parts of the diagonal elements are set tozero.
+      The imaginary parts of the diagonal elements are set to zero.
 
 
-.. container:: familylinks
+hpr2 (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
 
+
+      alpha
+         Scaling factor for the scalar-vector-vector products.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding
+         input vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+         The imaginary parts of the diagonal elements need not be set
+         and are assumed to be zero.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the Hermitian
+         matrix ``A`` if ``upper_lower =upper``, or the updated lower
+         triangular part of the Hermitian matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+         The imaginary parts of the diagonal elements are set to zero.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/iamax.rst b/docs/domains/blas/iamax.rst
index 1678b7279..5ab2d436d 100644
--- a/docs/domains/blas/iamax.rst
+++ b/docs/domains/blas/iamax.rst
@@ -1,4 +1,4 @@
-.. _iamax:
+.. _onemkl_blas_iamax:
 
 iamax
 =====
@@ -11,18 +11,8 @@ iamax
    vector.
 
 
-   .. container:: section
-      :name: GUID-D1ABF76D-DB39-4C23-A217-EA2C7C6D1325
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
 
-
-      .. cpp:function::  void iamax(queue &exec_queue, std::int64_t n,      buffer<T, 1> &x, std::int64_t incx, buffer<std::int64_t, 1>      &result)
-
-      iamax supports the following precisions.
+      ``iamax`` supports the following precisions.
 
 
       .. list-table:: 
@@ -38,15 +28,13 @@ iamax
 
 
 .. container:: section
-   :name: GUID-822D7950-256E-406D-9305-61F761080E69
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The iamax routines return an index ``i``\ such that ``x``\ [``i``]
+   The ``iamax`` routines return an index ``i``\ such that ``x``\ [``i``]
    has the maximum absolute value of all elements in vector ``x`` (real
    variants), or such that ``|Re(x[i])| + |Im(x[i])|`` is maximal
    (complex variants).
@@ -56,7 +44,6 @@ iamax
 
 
       .. rubric:: Note
-         :name: note
          :class: NoteTipHead
 
 
@@ -75,16 +62,28 @@ iamax
    index of the first ``NaN``.
 
 
+iamax (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<T, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result)
+
 .. container:: section
-   :name: GUID-CE43FE84-2066-4095-BB7E-0691CD045443
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -104,11 +103,9 @@ iamax
 
 
 .. container:: section
-   :name: ARGUMENTS_EC9F05BE9B09443F8BC59207D5EA40F1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -117,16 +114,80 @@ iamax
       is stored.
 
 
+iamax (USM Version)
+-------------------
 
-.. container:: familylinks
+.. container::
 
+   .. container:: section
 
-   .. container:: parentlink
 
+      .. rubric:: Syntax
+         :class: sectiontitle
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
 
+      .. container:: dlsyntaxpara
 
-.. container::
 
+         .. cpp:function::  sycl::event onemkl::blas::iamax(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         The number of elements in vector ``x``.
+
+
+      x
+         The pointer to the input vector ``x``. The array holding the
+         input vector ``x`` must be of size at least (1 + (``n`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         The stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         The pointer to where the zero-based index ``i`` of the maximal
+         element is stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/iamin.rst b/docs/domains/blas/iamin.rst
index ca5ea696f..509b575ef 100644
--- a/docs/domains/blas/iamin.rst
+++ b/docs/domains/blas/iamin.rst
@@ -1,4 +1,4 @@
-.. _iamin:
+.. _onemkl_blas_iamin:
 
 iamin
 =====
@@ -10,16 +10,6 @@ iamin
    Finds the index of the element with the smallest absolute value.
 
 
-   .. container:: section
-      :name: GUID-5D077B60-17B5-4961-AFF7-20D78BFB2A07
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void iamin(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<std::int64_t,1>      &result)
 
       ``iamin`` supports the following precisions.
 
@@ -37,15 +27,13 @@ iamin
 
 
 .. container:: section
-   :name: GUID-A820CE7B-E983-4D8F-A73A-753FD95BD507
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The iamin routines return an index ``i`` such that ``x``\ [``i``] has
+   The ``iamin`` routines return an index ``i`` such that ``x``\ [``i``] has
    the minimum absolute value of all elements in vector ``x`` (real
    variants), or such that \|Re(``x``\ [``i``])\| +
    \|Im(``x``\ [``i``])\| is maximal (complex variants).
@@ -55,7 +43,6 @@ iamin
 
 
       .. rubric:: Note
-         :name: note
          :class: NoteTipHead
 
 
@@ -74,16 +61,28 @@ iamin
    index of the first ``NaN``.
 
 
+iamin (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<std::int64_t,1> &result)
+
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -103,11 +102,9 @@ iamin
 
 
 .. container:: section
-   :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -116,15 +113,75 @@ iamin
       will be stored.
 
 
-.. container:: familylinks
+iamin (USM Version)
+-------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::iamin(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
 
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         The pointer to input vector ``x``. The array holding input
+         vector ``x`` must be of size at least (1 + (``n`` -
+         1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         Pointer to where the zero-based index ``i`` of the minimum
+         element will be stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/nrm2.rst b/docs/domains/blas/nrm2.rst
index dfbf2265c..55d8fcccd 100644
--- a/docs/domains/blas/nrm2.rst
+++ b/docs/domains/blas/nrm2.rst
@@ -1,4 +1,4 @@
-.. _nrm2:
+.. _onemkl_blas_nrm2:
 
 nrm2
 ====
@@ -10,16 +10,6 @@ nrm2
    Computes the Euclidean norm of a vector.
 
 
-   .. container:: section
-      :name: GUID-F55A15D5-CCDA-4C44-B86F-C9A5FB36725E
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void nrm2(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T_res,1> &result)
 
       ``nrm2`` supports the following precisions.
 
@@ -42,19 +32,13 @@ nrm2
 
 
 .. container:: section
-   :name: GUID-2BF2C965-5A8C-47F1-9C73-FB0E485CE32A
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The nrm2 routines computes Euclidean norm of a vector
-
-
-  
-
+   The ``nrm2`` routines compute the Euclidean norm of a vector
 
       result = ||x||,
 
@@ -65,16 +49,27 @@ nrm2
    ``x`` is a vector of ``n`` elements.
 
 
+nrm2 (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T_res,1> &result)
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -90,15 +85,13 @@ nrm2
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
 .. container:: section
-   :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -107,15 +100,80 @@ nrm2
       stored.
 
 
-.. container:: familylinks
+nrm2 (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::nrm2(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
 
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      result
+         Pointer to where the Euclidean norm of the vector ``x`` will be
+         stored.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rot.rst b/docs/domains/blas/rot.rst
index 2dc20bdce..c7534ee3f 100644
--- a/docs/domains/blas/rot.rst
+++ b/docs/domains/blas/rot.rst
@@ -1,4 +1,4 @@
-.. _rot:
+.. _onemkl_blas_rot:
 
 rot
 ===
@@ -10,16 +10,6 @@ rot
    Performs rotation of points in the plane.
 
 
-   .. container:: section
-      :name: GUID-9DD44991-6A55-49EE-BD0C-F13406FFBE52
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void rot(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy, T_scalar c, T_scalar s)
 
       ``rot`` supports the following precisions.
 
@@ -42,15 +32,13 @@ rot
 
 
 .. container:: section
-   :name: GUID-8B7F46D1-5047-4D4C-AF66-F0A3E4AC2BA5
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   Given two vectors ``x`` and ``y`` of ``n`` elements, the rot routines
+   Given two vectors ``x`` and ``y`` of ``n`` elements, the ``rot`` routines
    compute four scalar-vector products and update the input vectors with
    the sum of two of these scalar-vector products as follow:
 
@@ -61,16 +49,28 @@ rot
   
 
 
+rot (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::rot(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, T_scalar c, T_scalar s)
+
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -86,7 +86,7 @@ rot
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
@@ -97,7 +97,7 @@ rot
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
    c
@@ -109,11 +109,9 @@ rot
 
 
 .. container:: section
-   :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -125,15 +123,103 @@ rot
       Buffer holding updated buffer ``y``.
 
 
-.. container:: familylinks
+rot (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::rot(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, T_scalar c, T_scalar s, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
 
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      c
+         Scaling factor.
+
+
+      s
+         Scaling factor.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated vector ``x``.
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rotg.rst b/docs/domains/blas/rotg.rst
index 3110a60a8..df3f8396c 100644
--- a/docs/domains/blas/rotg.rst
+++ b/docs/domains/blas/rotg.rst
@@ -1,4 +1,4 @@
-.. _rotg:
+.. _onemkl_blas_rotg:
 
 rotg
 ====
@@ -10,16 +10,6 @@ rotg
    Computes the parameters for a Givens rotation.
 
 
-   .. container:: section
-      :name: GUID-E4B6E693-AC8C-4BB3-A197-3EB9E905B925
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void rotg(queue &exec_queue, buffer<T,1> &a,      buffer<T,1> &b, buffer<T_real,1> &c, buffer<T,1> &s)
 
       ``rotg`` supports the following precisions.
 
@@ -42,15 +32,13 @@ rotg
 
 
 .. container:: section
-   :name: GUID-5614B81D-C736-4714-88AB-29B38F9B3589
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   Given the Cartesian coordinates ``(a, b)`` of a point, the rotg
+   Given the Cartesian coordinates ``(a, b)`` of a point, the ``rotg``
    routines return the parameters ``c``, ``s``, ``r``, and ``z``
    associated with the Givens rotation. The parameters ``c`` and ``s``
    define a unitary matrix such that:
@@ -61,16 +49,28 @@ rotg
    1/``c``; otherwise ``z`` is 1.
 
 
+rotg (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::rotg(sycl::queue &queue, sycl::buffer<T,1> &a, sycl::buffer<T,1> &b, sycl::buffer<T_real,1> &c, sycl::buffer<T,1> &s)
+
 .. container:: section
-   :name: GUID-C2003328-15AA-4DF0-A417-40BECCA7DEA3
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed
 
 
@@ -83,11 +83,9 @@ rotg
 
 
 .. container:: section
-   :name: GUID-3B7937E3-2DF7-49A3-8F1E-2C9406BB4E88
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -111,15 +109,87 @@ rotg
       rotation.
 
 
-.. container:: familylinks
+rotg (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::rotg(sycl::queue &queue, T *a, T *b, T_real *c, T *s, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      a
+         Pointer to the ``x``-coordinate of the point.
+
+
+      b
+         Pointer to the ``y``-coordinate of the point.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the parameter ``r`` associated with the Givens
+         rotation.
+
+
+      b
+         Pointer to the parameter ``z`` associated with the Givens
+         rotation.
+
+
+      c
+         Pointer to the parameter ``c`` associated with the Givens
+         rotation.
+
+
+      s
+         Pointer to the parameter ``s`` associated with the Givens
+         rotation.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rotm.rst b/docs/domains/blas/rotm.rst
index 4d025c4ea..237abb96c 100644
--- a/docs/domains/blas/rotm.rst
+++ b/docs/domains/blas/rotm.rst
@@ -1,4 +1,4 @@
-.. _rotm:
+.. _onemkl_blas_rotm:
 
 rotm
 ====
@@ -10,16 +10,6 @@ rotm
    Performs modified Givens rotation of points in the plane.
 
 
-   .. container:: section
-      :name: GUID-F8F2E2EB-1704-454D-BE45-C055D6F4E7D6
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void rotm(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy, buffer<T,1> &param)
 
       ``rotm`` supports the following precisions.
 
@@ -35,11 +25,9 @@ rotm
 
 
 .. container:: section
-   :name: GUID-856650C6-2998-4452-A34A-DF6CB801087D
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -55,16 +43,28 @@ rotm
    transformation matrix.
 
 
+rotm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &param)
+
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -80,7 +80,7 @@ rotm
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
@@ -91,7 +91,7 @@ rotm
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
    param
@@ -102,12 +102,12 @@ rotm
       ``param``\ [0] contains a switch, ``flag``,
 
 
-      ``param``\ [1-4] contain *h\ 11*,\ *h\ 21*, *h\ 12*,\ *h\ 22*
-      respectively, the components ofthe modified Givens transformation
+      ``param``\ [1-4] contain *h\ 11*, \ *h\ 21*, *h\ 12*, and \ *h\ 22*
+      respectively, the components of the modified Givens transformation
       matrix ``H``.
 
 
-      Depending on the values of ``flag``, thecomponents of ``H`` are
+      Depending on the values of ``flag``, the components of ``H`` are
       set as follows:
 
 
@@ -133,11 +133,9 @@ rotm
 
 
 .. container:: section
-   :name: GUID-062D805B-68FF-41F6-8D9A-329C92A77EA3
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -149,22 +147,139 @@ rotm
       Buffer holding updated buffer ``y``.
 
 
-.. container:: familylinks
+rotm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::rotm(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, T *param, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to the input vector ``x``. The array holding the vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to the input vector ``y``. The array holding the vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
 
+      param
+         Pointer to an array of size 5. The elements of the ``param``
+         array are:
 
-.. |image0| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee1.png
-.. |image1| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee2.png
-.. |image2| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee3.png
-.. |image3| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee4.png
-.. |image4| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee5.png
+
+         ``param``\ [0] contains a switch, ``flag``,
+
+
+         ``param``\ [1-4] contain *h\ 11*, \ *h\ 21*, *h\ 12*, and \ *h\ 22*
+         respectively, the components of the modified Givens
+         transformation matrix ``H``.
+
+
+         Depending on the values of ``flag``, the components of ``H`` are
+         set as follows:
+
+
+         | ``flag =``\ ``-1.0``:
+         | |image1|
+
+
+         | ``flag =``\ ``0.0``:
+         | |image2|
+
+
+         | ``flag =``\ ``1.0``:
+         | |image3|
+
+
+         | ``flag =``\ ``-2.0``:
+         | |image4|
+
+
+         In the last three cases, the matrix entries of 1.0, -1.0, and 0.0
+         are assumed based on the value of ``flag`` and are not required
+         to be set in the ``param`` vector.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated array ``x``.
+
+
+      y
+         Pointer to the updated array ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee1.png
+.. |image1| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee2.png
+.. |image2| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee3.png
+.. |image3| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee4.png
+.. |image4| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee5.png
 
diff --git a/docs/domains/blas/rotmg.rst b/docs/domains/blas/rotmg.rst
index 64d6543ea..e89e64cc4 100644
--- a/docs/domains/blas/rotmg.rst
+++ b/docs/domains/blas/rotmg.rst
@@ -1,4 +1,4 @@
-.. _rotmg:
+.. _onemkl_blas_rotmg:
 
 rotmg
 =====
@@ -10,16 +10,6 @@ rotmg
    Computes the parameters for a modified Givens rotation.
 
 
-   .. container:: section
-      :name: GUID-DF41021D-C145-495B-A717-45FB5F36E676
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void rotmg(queue &exec_queue, buffer<T,1> &d1,      buffer<T,1> &d2, buffer<T,1> &x1, buffer<T,1> &y1, buffer<T,1>      &param)
 
       ``rotmg`` supports the following precisions.
 
@@ -35,11 +25,9 @@ rotmg
 
 
 .. container:: section
-   :name: GUID-5525F11C-A739-487E-A7CC-6886A088035D
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
@@ -53,16 +41,28 @@ rotmg
    | |image0|
 
 
+rotmg (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::rotmg(sycl::queue &queue, sycl::buffer<T,1> &d1, sycl::buffer<T,1> &d2, sycl::buffer<T,1> &x1, sycl::buffer<T,1> &y1, sycl::buffer<T,1> &param)
+
 .. container:: section
-   :name: GUID-21946B3A-A859-4293-8EE7-965328AA6717
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -85,11 +85,9 @@ rotmg
 
 
 .. container:: section
-   :name: GUID-1C0481DB-BB35-4DB7-941F-649EDAA77C6F
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -102,7 +100,7 @@ rotmg
 
 
    x1
-      Buffer holding the *x*-coordinate of the rotated vector before
+      Buffer holding the ``x``-coordinate of the rotated vector before
       scaling
 
 
@@ -144,22 +142,134 @@ rotmg
       be set in the ``param`` vector.
 
 
-.. container:: familylinks
+rotmg (USM Version)
+-------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::rotmg(sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
 
 
-.. |image0| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee1.png
-.. |image1| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee2.png
-.. |image2| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee3.png
-.. |image3| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee4.png
-.. |image4| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee5.png
+      d1
+         Pointer to the scaling factor for the ``x``-coordinate of the
+         input vector.
+
+
+      d2
+         Pointer to the scaling factor for the ``y``-coordinate of the
+         input vector.
+
+
+      x1
+         Pointer to the ``x``-coordinate of the input vector.
+
+
+      y1
+         Scalar specifying the ``y``-coordinate of the input vector.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      d1
+         Pointer to the first diagonal element of the updated matrix.
+
+
+      d2
+         Pointer to the second diagonal element of the updated matrix.
+
+
+      x1
+         Pointer to the ``x``-coordinate of the rotated vector before
+         scaling.
+
+
+      param
+         Pointer to an array of size 5.
+
+
+         The elements of the ``param`` array are:
+
+
+         ``param[0]`` contains a switch, ``flag``. The other array
+         elements ``param[1-4]`` contain the components of the array
+         ``H``: ``h``\ :sub:`11`, ``h``\ :sub:`21`, ``h``\ :sub:`12`,
+         and ``h``\ :sub:`22`, respectively.
+
+
+         Depending on the values of ``flag``, the components of ``H``
+         are set as follows:
+
+
+         | ``flag =``\ ``-1.0``:
+         | |image1|
+
+
+         | ``flag =``\ ``0.0``:
+         | |image2|
+
+
+         | ``flag =``\ ``1.0``:
+         | |image3|
+
+
+         | ``flag =``\ ``-2.0``:
+         | |image4|
+
+
+         In the last three cases, the matrix entries of 1.0, -1.0, and
+         0.0 are assumed based on the value of ``flag`` and are not
+         required to be set in the ``param`` vector.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee1.png
+.. |image1| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee2.png
+.. |image2| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee3.png
+.. |image3| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee4.png
+.. |image4| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee5.png
 
diff --git a/docs/domains/blas/sbmv.rst b/docs/domains/blas/sbmv.rst
index b28b9b027..e376818b0 100644
--- a/docs/domains/blas/sbmv.rst
+++ b/docs/domains/blas/sbmv.rst
@@ -1,4 +1,4 @@
-.. _sbmv:
+.. _onemkl_blas_sbmv:
 
 sbmv
 ====
@@ -10,16 +10,6 @@ sbmv
    Computes a matrix-vector product with a symmetric band matrix.
 
 
-   .. container:: section
-      :name: GUID-BEDE7E82-C168-498D-BF65-085BBCEF9A27
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void sbmv(queue &exec_queue, uplo upper_lower,      std::int64_t n, std::int64_t k, T alpha, buffer<T,1> &a,      std::int64_t lda, buffer<T,1> &x, std::int64_t incx, T beta,      buffer<T,1> &y, std::int64_t incy)
 
       ``sbmv`` supports the following precisions.
 
@@ -35,22 +25,17 @@ sbmv
 
 
 .. container:: section
-   :name: GUID-4F227157-1724-4D1F-AFAB-58C722CA8D08
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The sbmv routines compute a scalar-matrix-vector product and add the
+   The ``sbmv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a symmetric band matrix. The
    operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -67,23 +52,33 @@ sbmv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+sbmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -138,11 +133,9 @@ sbmv
 
 
 .. container:: section
-   :name: GUID-ABBEA4DA-7B4C-489A-8063-BDC09FBB1ADD
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -150,15 +143,121 @@ sbmv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+sbmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      k
+         Number of super-diagonals of the matrix ``A``. Must be at least
+         zero.
 
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least (``k`` +
+         1), and positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/scal.rst b/docs/domains/blas/scal.rst
index 97075eac6..ad4e9e753 100644
--- a/docs/domains/blas/scal.rst
+++ b/docs/domains/blas/scal.rst
@@ -1,4 +1,4 @@
-.. _scal:
+.. _onemkl_blas_scal:
 
 scal
 ====
@@ -10,16 +10,6 @@ scal
    Computes the product of a vector by a scalar.
 
 
-   .. container:: section
-      :name: GUID-178A4C6A-3BA5-40F7-A3D6-4B6590B75EB4
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void scal(queue &exec_queue, std::int64_t n,      T_scalar alpha, buffer<T,1> &x, std::int64_t incx)
 
       ``scal`` supports the following precisions.
 
@@ -46,18 +36,13 @@ scal
 
 
 .. container:: section
-   :name: GUID-8DDCA613-2750-43D0-A89B-13866F2DDE8C
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The scal routines computes a scalar-vector product:
-
-
-  
+   The ``scal`` routines compute a scalar-vector product:
 
 
       x <- alpha*x
@@ -72,16 +57,28 @@ scal
    ``alpha`` is a scalar.
 
 
+scal (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::scal(sycl::queue &queue, std::int64_t n, T_scalar alpha, sycl::buffer<T,1> &x, std::int64_t incx)
+
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -101,15 +98,13 @@ scal
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
 .. container:: section
-   :name: GUID-B36EBB3E-C79B-49F8-9F47-7B19BD6BE105
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -117,15 +112,77 @@ scal
       Buffer holding updated buffer ``x``.
 
 
-.. container:: familylinks
+scal (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::scal(sycl::queue &queue, std::int64_t n, T_scalar alpha, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
 
+
+      queue
+         The queue where the routine should be executed.
+
+
+      n
+         Number of elements in vector ``x``.
+
+
+      alpha
+         Specifies the scalar ``alpha``.
+
+
+      x
+         Pointer to the input vector ``x``. The array must be of size at
+         least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated array ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/sdsdot.rst b/docs/domains/blas/sdsdot.rst
index 11414fd5b..76ba70039 100644
--- a/docs/domains/blas/sdsdot.rst
+++ b/docs/domains/blas/sdsdot.rst
@@ -1,4 +1,4 @@
-.. _sdsdot:
+.. _onemkl_blas_sdsdot:
 
 sdsdot
 ======
@@ -10,35 +10,41 @@ sdsdot
    Computes a vector-vector dot product with double precision.
 
 
-   .. container:: section
-      :name: GUID-2DDFDC38-65FA-40F5-AACB-8E383623EF4A
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
 
+   .. container:: section
 
-      .. cpp:function::  void sdsdot(queue &exec_queue, std::int64_t n,      float sb, buffer<float,1> &x, std::int64_t incx, buffer<float,1>      &y, std::int64_t incy, buffer<float,1> &result)
 
       .. rubric:: Description
-         :name: description
          :class: sectiontitle
 
 
-      The sdsdot routines perform a dot product between two vectors with
+      The ``sdsdot`` routines perform a dot product between two vectors with
       double precision:
 
 
       |image0|
 
+sdsdot (Buffer Version)
+-----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
 
+      .. cpp:function::  void onemkl::blas::sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer<float,1> &x, std::int64_t incx, sycl::buffer<float,1> &y, std::int64_t incy, sycl::buffer<float,1> &result)
+   .. container:: section
+   
+   
       .. rubric:: Input Parameters
-         :name: input-parameters
          :class: sectiontitle
 
 
-      exec_queue
+      queue
          The queue where the routine should be executed.
 
 
@@ -59,7 +65,7 @@ sdsdot
 
 
       incx
-         Stride of vector x.
+         Stride of vector ``x``.
 
 
       y
@@ -71,11 +77,12 @@ sdsdot
 
 
       incy
-         Stride of vector y.
-
+         Stride of vector ``y``.
 
+   .. container:: section
+   
+   
       .. rubric:: Output Parameters
-         :name: output-parameters
          :class: sectiontitle
 
 
@@ -83,11 +90,95 @@ sdsdot
          Buffer where the result (a scalar) will be stored. If ``n`` < 0
          the result is ``sb``.
 
+sdsdot (USM Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. container:: dlsyntaxpara
 
-      **Parent topic:** :ref:`blas-level-1-routines`
+
+         .. cpp:function::  sycl::event onemkl::blas::sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *result, const sycl::vector_class<sycl::event> &dependencies = {})
+      .. container:: section
+      
+      
+         .. rubric:: Input Parameters
+            :class: sectiontitle
+
+
+         queue
+            The queue where the routine should be executed.
+
+
+         n
+            Number of elements in vectors ``x`` and ``y``.
+
+
+         sb
+            Single precision scalar to be added to the dot product.
+
+
+         x
+            Pointer to the input vector ``x``. The array must be of size
+            at least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and
+            Vector
+            Storage <../matrix-storage.html>`__
+            for more details.
+
+
+         incx
+            Stride of vector ``x``.
+
+
+         y
+            Pointer to the input vector ``y``. The array must be of size
+            at least (1 + (``n`` - 1)*abs(``incy``)). See `Matrix and
+            Vector
+            Storage <../matrix-storage.html>`__
+            for more details.
+
+
+         incy
+            Stride of vector ``y``.
+
+
+         dependencies
+            List of events to wait for before starting computation, if
+            any. If omitted, defaults to no dependencies.
+
+      .. container:: section
+      
       
+         .. rubric:: Output Parameters
+            :class: sectiontitle
+
+
+         result
+            Pointer to where the result (a scalar) will be stored. If
+            ``n`` < 0 the result is ``sb``.
+
+      .. container:: section
+   
+   
+         .. rubric:: Return Values
+            :class: sectiontitle
+
+
+         Output event to wait on to ensure computation is complete.
+
+.. container:: familylinks
+
 
+   .. container:: parentlink
 
-.. |image0| image:: ../equations/GUID-9DB212E1-03E2-430C-8B1F-8F5CBD4F2ee1.png
+         **Parent topic:** :ref:`blas-level-1-routines`
+.. |image0| image:: ../equations/GUID-9B91DAAE-72DD-4799-9983-12B021993ee1.png
    :class: img-middle
 
diff --git a/docs/domains/blas/spmv.rst b/docs/domains/blas/spmv.rst
index 0b1690df5..c744be625 100644
--- a/docs/domains/blas/spmv.rst
+++ b/docs/domains/blas/spmv.rst
@@ -1,4 +1,4 @@
-.. _spmv:
+.. _onemkl_blas_spmv:
 
 spmv
 ====
@@ -10,16 +10,6 @@ spmv
    Computes a matrix-vector product with a symmetric packed matrix.
 
 
-   .. container:: section
-      :name: GUID-BCC82B03-92EB-4D73-B69C-8AE8646FBEAC
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void spmv(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &a, buffer<T,1> &x,      std::int64_t incx, T beta, buffer<T,1> &y, std::int64_t incy)
 
       ``spmv`` supports the following precisions.
 
@@ -35,22 +25,17 @@ spmv
 
 
 .. container:: section
-   :name: GUID-D27BBFFF-79F4-4236-96A6-B305FA1858B0
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The spmv routines compute a scalar-matrix-vector product and add the
+   The ``spmv`` routines compute a scalar-matrix-vector product and add the
    result to a scalar-vector product, with a symmetric packed matrix.
    The operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -66,23 +51,33 @@ spmv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+spmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &a, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -127,11 +122,9 @@ spmv
 
 
 .. container:: section
-   :name: GUID-23FF1F5C-5560-40B6-807D-B6352FA320D6
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -139,15 +132,111 @@ spmv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+spmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
 
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      beta
+         Scaling factor for vector ``y``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/spr.rst b/docs/domains/blas/spr.rst
index 0112706b6..4b3c97e4d 100644
--- a/docs/domains/blas/spr.rst
+++ b/docs/domains/blas/spr.rst
@@ -1,4 +1,4 @@
-.. _spr:
+.. _onemkl_blas_spr:
 
 spr
 ===
@@ -10,16 +10,6 @@ spr
    Performs a rank-1 update of a symmetric packed matrix.
 
 
-   .. container:: section
-      :name: GUID-34904813-AFD9-4349-9DAC-A7221FBE9F97
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void spr(queue &exec_queue, uplo upper_lower,      std::std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &a)
 
       ``spr`` supports the following precisions.
 
@@ -32,24 +22,17 @@ spr
          * -  ``double`` 
 
 
-
-
 .. container:: section
-   :name: GUID-E387B33A-CA59-45D8-BB01-31DF76C82A0D
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The spr routines compute a scalar-vector-vector product and add the
+   The ``spr`` routines compute a scalar-vector-vector product and add the
    result to a symmetric packed matrix. The operation is defined as
 
 
-  
-
-
       A <- alpha*x*x :sup:`T` + A
 
 
@@ -65,23 +48,33 @@ spr
    ``x`` is a vector of length ``n``.
 
 
+spr (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &a)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -111,35 +104,111 @@ spr
 
 
 .. container:: section
-   :name: GUID-9FBC2F3B-EB8F-4733-ABBA-08D5685A761B
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
-   **sycl:**
-       
-
-
-
    a
-      Buffer holding the updated upper triangularpart of the symmetric
+      Buffer holding the updated upper triangular part of the symmetric
       matrix ``A`` if ``upper_lower =upper``, or the updated lower
-      triangular part of thesymmetric matrix ``A`` if
+      triangular part of the symmetric matrix ``A`` if
       ``upper_lower =lower``.
 
 
-.. container:: familylinks
+spr (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
 
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the vector-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the symmetric
+         matrix ``A`` if ``upper_lower =upper``, or the updated lower
+         triangular part of the symmetric matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/spr2.rst b/docs/domains/blas/spr2.rst
index ca78f30ac..dd013a716 100644
--- a/docs/domains/blas/spr2.rst
+++ b/docs/domains/blas/spr2.rst
@@ -1,4 +1,4 @@
-.. _spr2:
+.. _onemkl_blas_spr2:
 
 spr2
 ====
@@ -10,16 +10,6 @@ spr2
    Computes a rank-2 update of a symmetric packed matrix.
 
 
-   .. container:: section
-      :name: GUID-44B72132-1EC0-41FA-9189-4596CFD651B0
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void spr2(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a)
 
       ``spr`` supports the following precisions.
 
@@ -35,15 +25,13 @@ spr2
 
 
 .. container:: section
-   :name: GUID-3AF7EB4D-B3FE-4C0A-B7A0-6E286D4C642F
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The spr2 routines compute two scalar-vector-vector products and add
+   The ``spr2`` routines compute two scalar-vector-vector products and add
    them to a symmetric packed matrix. The operation is defined as
 
 
@@ -65,23 +53,32 @@ spr2
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+spr2 (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::spr2(sycl::queue &queue, uplo upper_lower,      std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx,      sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -122,35 +119,123 @@ spr2
 
 
 .. container:: section
-   :name: GUID-9796BA93-31FB-40B9-B139-219905913736
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
-   **sycl:**
-       
-
-
-
    a
-      Buffer holding the updated upper triangularpart of the symmetric
+      Buffer holding the updated upper triangular part of the symmetric
       matrix ``A`` if ``upper_lower =upper`` or the updated lower
-      triangular part of thesymmetric matrix ``A`` if
+      triangular part of the symmetric matrix ``A`` if
       ``upper_lower =lower``.
 
 
-.. container:: familylinks
+spr2 (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the vector-vector products.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input vector
+         ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the symmetric
+         matrix ``A`` if ``upper_lower =upper`` or the updated lower
+         triangular part of the symmetric matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/swap.rst b/docs/domains/blas/swap.rst
index 3d4542779..18e22a5b5 100644
--- a/docs/domains/blas/swap.rst
+++ b/docs/domains/blas/swap.rst
@@ -1,4 +1,4 @@
-.. _swap:
+.. _onemkl_blas_swap:
 
 swap
 ====
@@ -10,18 +10,8 @@ swap
    Swaps a vector with another vector.
 
 
-   .. container:: section
-      :name: GUID-F0DF0055-DF25-4EC7-8FF2-48D4FA91E42E
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
 
-
-      .. cpp:function::  void swap(queue &exec_queue, std::int64_t n,      buffer<T,1> &x, std::int64_t incx, buffer<T,1> &y, std::int64_t      incy)
-
-      swap supports the following precisions.
+      ``swap`` supports the following precisions.
 
 
       .. list-table:: 
@@ -34,38 +24,43 @@ swap
          * -  ``std::complex<double>`` 
 
 
-
-
 .. container:: section
-   :name: GUID-FE88C4B7-4C74-41F8-94DE-E62888DD3BA4
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   Given two vectors of ``n`` elements, ``x`` and ``y``, the swap
+   Given two vectors of ``n`` elements, ``x`` and ``y``, the ``swap``
    routines return vectors ``y`` and ``x`` swapped, each replacing the
    other.
 
 
-  
+      y <- x, x <- y
 
 
-      y <- x, x <- y
+swap (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
+      .. cpp:function::  void onemkl::blas::swap(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy)
+
 .. container:: section
-   :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
@@ -81,7 +76,7 @@ swap
 
 
    incx
-      Stride of vector x.
+      Stride of vector ``x``.
 
 
    y
@@ -92,15 +87,13 @@ swap
 
 
    incy
-      Stride of vector y.
+      Stride of vector ``y``.
 
 
 .. container:: section
-   :name: GUID-106AC665-DCBA-40ED-8779-0D9017064855
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -114,15 +107,95 @@ swap
       ``x``.
 
 
-.. container:: familylinks
+swap (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-1-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::swap(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      n
+         Number of elements in vector ``x``.
+
+
+      x
+         Pointer to the input vector ``x``. The array must be of size at
+         least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to the input vector ``y``. The array must be of size at
+         least (1 + (``n`` - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated array ``x``, that is, the input vector
+         ``y``.
+
+
+      y
+         Pointer to the updated array ``y``, that is, the input vector
+         ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/symm.rst b/docs/domains/blas/symm.rst
index c14d9d2bf..87b1252de 100644
--- a/docs/domains/blas/symm.rst
+++ b/docs/domains/blas/symm.rst
@@ -1,4 +1,4 @@
-.. _symm:
+.. _onemkl_blas_symm:
 
 symm
 ====
@@ -11,18 +11,8 @@ symm
    and one matrix is general.
 
 
-   .. container:: section
-      :name: GUID-BFE36A6B-941E-4B49-AB0E-CFB687B1AD64
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void symm(queue &exec_queue, side left_right,      uplo upper_lower, std::int64_t m, std::int64_t n, T alpha,      buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b, std::int64_t      ldb, T beta, buffer<T,1> &c, std::int64_t ldc)
 
-      symm supports the following precisions.
+      ``symm`` supports the following precisions.
 
 
       .. list-table:: 
@@ -35,18 +25,14 @@ symm
          * -  ``std::complex<double>`` 
 
 
-
-
 .. container:: section
-   :name: GUID-E8FE37B0-C527-4AA6-B57F-AE3F4843F23A
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The symm routines compute a scalar-matrix-matrix product and add the
+   The ``symm`` routines compute a scalar-matrix-matrix product and add the
    result to a scalar-matrix product, where one of the matrices in the
    multiplication is symmetric. The argument ``left_right`` determines
    if the symmetric matrix, ``A``, is on the left of the multiplication
@@ -55,18 +41,11 @@ symm
    defined as
 
 
-  
-
-
       C <- alpha*A*B + beta*C,
 
-
    or
 
 
-  
-
-
       C <- alpha*B*A + beta*C,
 
 
@@ -82,31 +61,39 @@ symm
    ``B`` and ``C`` are ``m``-by-``n`` matrices.
 
 
+symm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
+
 .. container:: section
-   :name: GUID-70716375-C54E-4AA6-94DC-65AF79D46BB2
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    left_right
       Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
 
 
    upper_lower
-      Specifies whether *A*'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A``'s data is stored in its upper or lower
+      triangle. See :ref:`onemkl_datatypes` for more details.
 
 
    m
@@ -164,11 +151,9 @@ symm
 
 
 .. container:: section
-   :name: GUID-DD569858-5D3C-4565-8BAB-FE548427DCF2
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -180,11 +165,9 @@ symm
 
 
 .. container:: section
-   :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
@@ -192,15 +175,146 @@ symm
    calling ``symm``.
 
 
-.. container:: familylinks
+symm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      left_right
+         Specifies whether ``A`` is on the left side of the
+         multiplication (``side::left``) or on the right side
+         (``side::right``). See :ref:`onemkl_datatypes` for more details.
+
+
+      upper_lower
+         Specifies whether ``A``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+      m
+         Number of rows of ``B`` and ``C``. The value of ``m`` must be
+         at least zero.
+
+
+      n
+         Number of columns of ``B`` and ``C``. The value of ``n`` must
+         be at least zero.
+
 
+      alpha
+         Scaling factor for the matrix-matrix product.
+
+
+      a
+         Pointer to input matrix ``A``. Must have size at least
+         ``lda``\ \*\ ``m`` if ``A`` is on the left of the
+         multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right.
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``m`` if ``A`` is
+         on the left of the multiplication, or at least ``n`` if ``A``
+         is on the right. Must be positive.
+
+
+      b
+         Pointer to input matrix ``B``. Must have size at least
+         ``ldb``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be positive and at least
+         ``m``.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``m``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by
+         ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C``
+         (``left_right`` = ``side::left``) or
+         ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
+         (``left_right`` = ``side::right``).
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``beta`` = 0, matrix ``C`` does not need to be initialized
+      before calling ``symm``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/symv.rst b/docs/domains/blas/symv.rst
index 8d59ed90a..d7abca890 100644
--- a/docs/domains/blas/symv.rst
+++ b/docs/domains/blas/symv.rst
@@ -1,4 +1,4 @@
-.. _symv:
+.. _onemkl_blas_symv:
 
 symv
 ====
@@ -10,16 +10,6 @@ symv
    Computes a matrix-vector product for a symmetric matrix.
 
 
-   .. container:: section
-      :name: GUID-1E9C9EA9-0366-420E-A704-AB605C8ED92A
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void symv(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &a, std::int64_t lda,      buffer<T,1> &x, std::int64_t incx, T beta, buffer<T,1> &y,      std::int64_t incy)
 
       ``symv`` supports the following precisions.
 
@@ -32,25 +22,18 @@ symv
          * -  ``double`` 
 
 
-
-
 .. container:: section
-   :name: GUID-DE8D8321-D53D-4226-A940-CDE0E720EC95
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The symv routines routines compute a scalar-matrix-vector product and
+   The ``symv`` routines compute a scalar-matrix-vector product and
    add the result to a scalar-vector product, with a symmetric matrix.
    The operation is defined as
 
 
-  
-
-
       y <- alpha*A*x + beta*y
 
 
@@ -66,23 +49,32 @@ symv
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+symv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx, T beta, sycl::buffer<T,1> &y, std::int64_t incy)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -128,11 +120,9 @@ symv
 
 
 .. container:: section
-   :name: GUID-E16C8443-A2A4-483C-9D46-FF428E80FEB0
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -140,15 +130,112 @@ symv
       Buffer holding the updated vector ``y``.
 
 
-.. container:: familylinks
+symv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class<sycl::event> &dependencies = {})
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
 
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input/output vector ``y``. The array holding
+         input/output vector ``y`` must be of size at least (1 + (``n``
+         - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      y
+         Pointer to the updated vector ``y``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr.rst b/docs/domains/blas/syr.rst
index e3ff12e6b..8eca1cfe3 100644
--- a/docs/domains/blas/syr.rst
+++ b/docs/domains/blas/syr.rst
@@ -1,4 +1,4 @@
-.. _syr:
+.. _onemkl_blas_syr:
 
 syr
 ===
@@ -10,16 +10,6 @@ syr
    Computes a rank-1 update of a symmetric matrix.
 
 
-   .. container:: section
-      :name: GUID-E620D36F-6B4E-40A6-8BDA-3D625DEF55A8
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void syr(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &a, std::int64_t lda)
 
       ``syr`` supports the following precisions.
 
@@ -32,25 +22,18 @@ syr
          * -  ``double`` 
 
 
-
-
 .. container:: section
-   :name: GUID-E154DE4B-4559-4471-B92B-46AF8777AC97
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The syr routines compute a scalar-vector-vector product add them and
+   The ``syr`` routines compute a scalar-vector-vector product and
    add the result to a matrix, with a symmetric matrix. The operation is
    defined as
 
 
-  
-
-
       A  <- alpha*x*x :sup:`T` + A
 
 
@@ -66,23 +49,33 @@ syr
    ``x`` is a vector of length ``n``.
 
 
+syr (Buffer Version)
+--------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &a, std::int64_t lda)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -117,30 +110,116 @@ syr
 
 
 .. container:: section
-   :name: GUID-C03D1215-FD77-4AD8-8FA2-C48A5D8B938C
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated upper triangularpart of the symmetric
+      Buffer holding the updated upper triangular part of the symmetric
       matrix ``A`` if ``upper_lower =upper`` or the updated lower
-      triangular part of thesymmetric matrix ``A`` if
+      triangular part of the symmetric matrix ``A`` if
       ``upper_lower =lower``.
 
 
-.. container:: familylinks
+syr (USM Version)
+-----------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
 
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the symmetric
+         matrix ``A`` if ``upper_lower =upper`` or the updated lower
+         triangular part of the symmetric matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr2.rst b/docs/domains/blas/syr2.rst
index 6459801cf..acded7ff8 100644
--- a/docs/domains/blas/syr2.rst
+++ b/docs/domains/blas/syr2.rst
@@ -1,4 +1,4 @@
-.. _syr2:
+.. _onemkl_blas_syr2:
 
 syr2
 ====
@@ -10,16 +10,6 @@ syr2
    Computes a rank-2 update of a symmetric matrix.
 
 
-   .. container:: section
-      :name: GUID-580F2222-D47E-43A3-B9A2-037F353825D5
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void syr2(queue &exec_queue, uplo upper_lower,      std::int64_t n, T alpha, buffer<T,1> &x, std::int64_t incx,      buffer<T,1> &y, std::int64_t incy, buffer<T,1> &a, std::int64_t      lda)
 
       ``syr2`` supports the following precisions.
 
@@ -32,25 +22,18 @@ syr2
          * -  ``double`` 
 
 
-
-
 .. container:: section
-   :name: GUID-CDA05459-F2FE-4933-A552-D6E52EC46D13
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The syr2 routines compute two scalar-vector-vector product add them
+   The ``syr2`` routines compute two scalar-vector-vector products, add them
    and add the result to a matrix, with a symmetric matrix. The
    operation is defined as
 
 
-  
-
-
       A <- alpha*x*y :sup:`T` + alpha*y*x :sup:`T` + A
 
 
@@ -66,23 +49,33 @@ syr2
    ``x`` and ``y`` are vectors of length ``n``.
 
 
+syr2 (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer<T,1> &x, std::int64_t incx, sycl::buffer<T,1> &y, std::int64_t incy, sycl::buffer<T,1> &a, std::int64_t lda)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -128,30 +121,128 @@ syr2
 
 
 .. container:: section
-   :name: GUID-6992A39F-8AB7-42D9-B126-4F8ECF9C1ECE
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    a
-      Buffer holding the updated upper triangularpart of the symmetric
+      Buffer holding the updated upper triangular part of the symmetric
       matrix ``A`` if ``upper_lower =upper``, or the updated lower
-      triangular part of thesymmetric matrix ``A`` if
+      triangular part of the symmetric matrix ``A`` if
       ``upper_lower =lower``.
 
 
-.. container:: familylinks
+syr2 (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
+      n
+         Number of columns of ``A``. Must be at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-vector product.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      y
+         Pointer to input vector ``y``. The array holding input
+         vector ``y`` must be of size at least
+         (1 + (``n`` - 1)*abs(``incy``)). See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incy
+         Stride of vector ``y``.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      a
+         Pointer to the updated upper triangular part of the symmetric
+         matrix ``A`` if ``upper_lower =upper``, or the updated lower
+         triangular part of the symmetric matrix ``A`` if
+         ``upper_lower =lower``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr2k.rst b/docs/domains/blas/syr2k.rst
index e5687c856..3299e5da8 100644
--- a/docs/domains/blas/syr2k.rst
+++ b/docs/domains/blas/syr2k.rst
@@ -1,4 +1,4 @@
-.. _syr2k:
+.. _onemkl_blas_syr2k:
 
 syr2k
 =====
@@ -10,18 +10,8 @@ syr2k
    Performs a symmetric rank-2k update.
 
 
-   .. container:: section
-      :name: GUID-EED2648B-6435-4DD1-AC36-21039DFC61DD
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void syr2k(queue &exec_queue, uplo upper_lower,      transpose trans, std::int64_t n, std::int64_t k, T alpha,      buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b, std::int64_t      ldb, T beta, buffer<T,1> &c, std::int64_t ldc)
-
-      syr2k supports the following precisions:
+      ``syr2k`` supports the following precisions:
 
 
       .. list-table:: 
@@ -37,35 +27,27 @@ syr2k
 
 
 .. container:: section
-   :name: GUID-1FB46B8F-1B13-4A6B-A3A5-0A5B34049068
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The syr2k routines perform a rank-2k update of an ``n`` x ``n``
+   The ``syr2k`` routines perform a rank-2k update of an ``n`` x ``n``
    symmetric matrix ``C`` by general matrices ``A`` and ``B``. If
    ``trans`` = ``transpose::nontrans``, the operation is defined as:
 
 
-  
-
-
       C <- alpha*(A*B :sup:`T` + B*A :sup:`T`) + beta*C
 
 
    where ``A`` is ``n`` x ``k`` and ``B`` is ``k`` x ``n``.
 
 
-   If ``trans`` = ``transpose::trans``, the operationis defined as:
-
-
-  
+   If ``trans`` = ``transpose::trans``, the operation is defined as:
 
 
-      C <- alpha*(A :sup:`T`*B + B :sup:`T`*A) + beta*C
+      C <- alpha*(A :sup:`T` * B + B :sup:`T` * A) + beta * C
 
 
    where ``A`` is ``k`` x ``n`` and ``B`` is ``n`` x ``k``.
@@ -83,24 +65,34 @@ syr2k
    The inner dimension of both matrix multiplications is ``k``.
 
 
+syr2k (Buffer Version)
+----------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
+
 .. container:: section
-   :name: GUID-3EBEFBDD-93AF-4376-9BA2-A7042179BF13
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
       Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangle. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
@@ -170,27 +162,148 @@ syr2k
 
 
 .. container:: section
-   :name: GUID-5779F783-54BC-4887-9CBB-96B8EC9F00E9
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
    c
-      Output buffer, overwritten by the updated C matrix.
+      Output buffer, overwritten by the updated ``C`` matrix.
 
 
-.. container:: familylinks
+syr2k (USM Version)
+-------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``C``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+      trans
+         Specifies the operation to apply, as described above.
+         Conjugation is never performed, even if ``trans`` =
+         ``transpose::conjtrans``.
+
+
+      n
+         Number of rows and columns in ``C``. The value of ``n`` must be
+         at least zero.
 
+
+      k
+         Inner dimension of matrix multiplications. The value of ``k``
+         must be at least zero.
+
+
+      alpha
+         Scaling factor for the rank-2\ ``k`` update.
+
+
+      a
+         Pointer to input matrix ``A``. If ``A`` is not transposed,
+         ``A`` is an ``n``-by-``k`` matrix so the array ``a`` must have
+         size at least ``lda``\ \*\ ``k``. If ``A`` is transposed, ``A``
+         is a ``k``-by-``n`` matrix so the array ``a`` must have size
+         at least ``lda``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``n`` if ``trans``
+         = ``transpose::nontrans``, and at least ``k`` otherwise. Must
+         be positive.
+
+
+      b
+         Pointer to input matrix ``B``. If ``trans`` =
+         ``transpose::nontrans``, ``B`` is an ``k``-by-``n`` matrix so
+         the array ``b`` must have size at least ``ldb``\ \*\ ``n``.
+         Otherwise, ``B`` is an ``n``-by-``k`` matrix so the array ``b``
+         must have size at least ``ldb``\ \*\ ``k``. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be at least ``k`` if ``trans``
+         = ``transpose::nontrans``, and at least ``n`` otherwise. Must
+         be positive.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least
+         ``n``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by the updated ``C``
+         matrix.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/syrk.rst b/docs/domains/blas/syrk.rst
index d097db2dd..aa94d0767 100644
--- a/docs/domains/blas/syrk.rst
+++ b/docs/domains/blas/syrk.rst
@@ -1,4 +1,4 @@
-.. _syrk:
+.. _onemkl_blas_syrk:
 
 syrk
 ====
@@ -10,18 +10,8 @@ syrk
    Performs a symmetric rank-k update.
 
 
-   .. container:: section
-      :name: GUID-F8123F9B-A182-4BDB-A1A3-90FEC4F56231
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void syrk(queue &exec_queue, uplo upper_lower,      transpose trans, std::int64_t n, std::int64_t k, T alpha,      buffer<T,1> &a, std::int64_t lda, T beta, buffer<T,1> &c,      std::int64_t ldc)
-
-      syrk supports the following precisions.
+      ``syrk`` supports the following precisions.
 
 
       .. list-table:: 
@@ -34,23 +24,17 @@ syrk
          * -  ``std::complex<double>`` 
 
 
-
-
 .. container:: section
-   :name: GUID-8E133139-EE58-44B8-A507-2263BDD1399B
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The syrk routines perform a rank-k update of a symmetric matrix ``C``
+   The ``syrk`` routines perform a rank-k update of a symmetric matrix ``C``
    by a general matrix ``A``. The operation is defined as:
 
 
-  
-
 
       C <- alpha*op(A)*op(A)T + beta*C
 
@@ -71,31 +55,38 @@ syrk
    Here op(``A``) is ``n``-by-``k``, and ``C`` is ``n``-by-``n``.
 
 
+syrk (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, T beta, sycl::buffer<T,1> &c, std::int64_t ldc)
+
 .. container:: section
-   :name: GUID-96D007CC-23F0-46FA-9085-6DBFC5BB30E6
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
       Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangle. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``
-      (See
-      :ref:`onemkl_datatypes` for more
-      details). Conjugation is never performed, even if ``trans`` =
+      Specifies op(``A``), the transposition operation applied to ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if ``trans`` =
       ``transpose::conjtrans``.
 
 
@@ -145,11 +136,9 @@ syrk
 
 
 .. container:: section
-   :name: GUID-E14CE68E-2E28-48BB-8FD7-B84A21563BDA
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -158,15 +147,118 @@ syrk
       ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` + ``beta``\ \*\ ``C``.
 
 
-.. container:: familylinks
+syrk (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``C``'s data is stored in its upper or lower
+         triangle. See :ref:`onemkl_datatypes` for more details.
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if
+         ``trans`` = ``transpose::conjtrans``.
 
+
+      n
+         Number of rows and columns in ``C``. The value of ``n`` must be
+         at least zero.
+
+
+      k
+         Number of columns in op(``A``). The value of ``k`` must be at
+         least zero.
+
+
+      alpha
+         Scaling factor for the rank-``k`` update.
+
+
+      a
+         Pointer to input matrix ``A``. If ``trans`` =
+         ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so
+         the array ``a`` must have size at least ``lda``\ \*\ ``k``.
+         Otherwise, ``A`` is an ``k``-by-``n`` matrix so the array ``a``
+         must have size at least ``lda``\ \*\ ``n``. See `Matrix and
+         Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``n`` if ``A`` is
+         not transposed, and at least ``k`` if ``A`` is transposed. Must
+         be positive.
+
+
+      beta
+         Scaling factor for matrix ``C``.
+
+
+      c
+         Pointer to input/output matrix ``C``. Must have size at least
+         ``ldc``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
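+
+      ldc
+         Leading dimension of ``C``. Must be positive and at least ``n``.
+
+      dependencies
+         List of events to wait for before starting computation, if any. If omitted, defaults to no dependencies.
+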
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      c
+         Pointer to the output matrix, overwritten by
+         ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` +
+         ``beta``\ \*\ ``C``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/tbmv.rst b/docs/domains/blas/tbmv.rst
index eb5f7acf0..cb570220a 100644
--- a/docs/domains/blas/tbmv.rst
+++ b/docs/domains/blas/tbmv.rst
@@ -1,4 +1,4 @@
-.. _tbmv:
+.. _onemkl_blas_tbmv:
 
 tbmv
 ====
@@ -10,16 +10,6 @@ tbmv
    Computes a matrix-vector product using a triangular band matrix.
 
 
-   .. container:: section
-      :name: GUID-BAC06253-0516-4F7F-97E6-C4CBA2DBB1A2
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void tbmv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t      k, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &x, std::int64_t      incx)
 
       ``tbmv`` supports the following precisions.
 
@@ -37,21 +27,16 @@ tbmv
 
 
 .. container:: section
-   :name: GUID-4279E883-09A1-48F0-B9DA-8A1E86886B17
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The tbmv routines compute a matrix-vector product with a triangular
+   The ``tbmv`` routines compute a matrix-vector product with a triangular
    band matrix. The operation is defined as
 
 
-  
-
-
       x <- op(A)*x
 
 
@@ -69,36 +54,41 @@ tbmv
    ``x`` is a vector of length ``n``.
 
 
+tbmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
+
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -134,11 +124,9 @@ tbmv
 
 
 .. container:: section
-   :name: GUID-0B96A584-2EC7-484C-9FB0-C632053F0461
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -146,15 +134,110 @@ tbmv
       Buffer holding the updated vector ``x``.
 
 
-.. container:: familylinks
+tbmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
 
+
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      k
+         Number of sub/super-diagonals of the matrix ``A``. Must be at
+         least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least (``k`` +
+         1), and positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tbsv.rst b/docs/domains/blas/tbsv.rst
index 73aab67bd..03156b461 100644
--- a/docs/domains/blas/tbsv.rst
+++ b/docs/domains/blas/tbsv.rst
@@ -1,4 +1,4 @@
-.. _tbsv:
+.. _onemkl_blas_tbsv:
 
 tbsv
 ====
@@ -11,16 +11,6 @@ tbsv
    triangular band matrix.
 
 
-   .. container:: section
-      :name: GUID-4AC7186F-2D61-44C2-95BC-5981E750A021
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void tbsv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t      k, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &x, std::int64_t      incx)
 
       ``tbsv`` supports the following precisions.
 
@@ -38,22 +28,17 @@ tbsv
 
 
 .. container:: section
-   :name: GUID-5AF4221C-AB14-4F9B-97A8-CAA78DF05E36
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The tbsv routines solve a system of linear equations whose
+   The ``tbsv`` routines solve a system of linear equations whose
    coefficients are in a triangular band matrix. The operation is
    defined as
 
 
-  
-
-
       op(A)*x = b
 
 
@@ -71,36 +56,40 @@ tbsv
    ``b`` and ``x`` are vectors of length ``n``.
 
 
+tbsv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx)
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -136,11 +125,9 @@ tbsv
 
 
 .. container:: section
-   :name: GUID-24B3C6B8-7FBD-4B24-84F2-242635B3026E
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -148,15 +135,110 @@ tbsv
       Buffer holding the solution vector ``x``.
 
 
-.. container:: familylinks
+tbsv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+
+         .. cpp:function::  sycl::event onemkl::blas::tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
 
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      k
+         Number of sub/super-diagonals of the matrix ``A``. Must be at
+         least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least (``k`` +
+         1), and positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the solution vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tpmv.rst b/docs/domains/blas/tpmv.rst
index f8bd2b136..ac49c38f0 100644
--- a/docs/domains/blas/tpmv.rst
+++ b/docs/domains/blas/tpmv.rst
@@ -1,4 +1,4 @@
-.. _tpmv:
+.. _onemkl_blas_tpmv:
 
 tpmv
 ====
@@ -10,16 +10,6 @@ tpmv
    Computes a matrix-vector product using a triangular packed matrix.
 
 
-   .. container:: section
-      :name: GUID-5785B6D6-DB9C-43FA-B98A-009D5E077A9D
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void tpmv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, buffer<T,1>      &a, buffer<T,1> &x, std::int64_t incx)
 
       ``tpmv`` supports the following precisions.
 
@@ -37,21 +27,16 @@ tpmv
 
 
 .. container:: section
-   :name: GUID-A045480A-2EC1-4C73-A836-468324FCC85A
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The tpmv routines compute a matrix-vector product with a triangular
+   The ``tpmv`` routines compute a matrix-vector product with a triangular
    packed matrix. The operation is defined as
 
 
-  
-
-
       x <- op(A)*x
 
 
@@ -69,36 +54,41 @@ tpmv
    ``x`` is a vector of length ``n``.
 
 
+tpmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer<T,1> &a, sycl::buffer<T,1> &x, std::int64_t incx)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -124,11 +114,9 @@ tpmv
 
 
 .. container:: section
-   :name: GUID-180038D9-902F-4B20-AB6B-E38F2A6C83E4
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -136,15 +124,100 @@ tpmv
       Buffer holding the updated vector ``x``.
 
 
-.. container:: familylinks
+tpmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
+
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tpsv.rst b/docs/domains/blas/tpsv.rst
index 0ec419bd0..3bd98ad73 100644
--- a/docs/domains/blas/tpsv.rst
+++ b/docs/domains/blas/tpsv.rst
@@ -1,4 +1,4 @@
-.. _tpsv:
+.. _onemkl_blas_tpsv:
 
 tpsv
 ====
@@ -11,16 +11,6 @@ tpsv
    triangular packed matrix.
 
 
-   .. container:: section
-      :name: GUID-230CF8CA-B38D-4CB6-9917-029FEF53EBED
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void tpsv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t      k, buffer<T,1> &a, buffer<T,1> &x, std::int64_t incx)
 
       ``tpsv`` supports the following precisions.
 
@@ -38,22 +28,17 @@ tpsv
 
 
 .. container:: section
-   :name: GUID-7AD9F8E2-1343-4A6D-8C6A-F68D934292B7
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The tpsv routines solve a system of linear equations whose
+   The ``tpsv`` routines solve a system of linear equations whose
    coefficients are in a triangular packed matrix. The operation is
    defined as
 
 
-  
-
-
       op(A)*x = b
 
 
@@ -71,36 +56,40 @@ tpsv
    ``b`` and ``x`` are vectors of length ``n``.
 
 
+tpsv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+      .. cpp:function::  void onemkl::blas::tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer<T,1> &a, sycl::buffer<T,1> &x, std::int64_t incx)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -127,11 +116,9 @@ tpsv
 
 
 .. container:: section
-   :name: GUID-F515C77C-1E84-424B-A00A-874ACBEFBF9E
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -139,15 +126,101 @@ tpsv
       Buffer holding the solution vector ``x``.
 
 
-.. container:: familylinks
+tpsv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
 
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
+
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      x
+         Pointer to the ``n``-element right-hand side vector ``b``. The
+         array holding the ``n``-element right-hand side vector ``b``
+         must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the solution vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/trmm.rst b/docs/domains/blas/trmm.rst
index 2dbbc85b0..0c61ec772 100644
--- a/docs/domains/blas/trmm.rst
+++ b/docs/domains/blas/trmm.rst
@@ -1,4 +1,4 @@
-.. _trmm:
+.. _onemkl_blas_trmm:
 
 trmm
 ====
@@ -11,18 +11,8 @@ trmm
    and one input matrix is general.
 
 
-   .. container:: section
-      :name: GUID-15B16EFC-8B31-4459-88DC-A8C5EF6C9932
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void trmm(queue &exec_queue, uplo upper_lower,      transpose transa, diag unit_diag, std::int64_t m, std::int64_t n,      T alpha, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &b,      std::int64_t ldb)
-
-      trmm supports the following precisions.
+      ``trmm`` supports the following precisions.
 
 
       .. list-table:: 
@@ -35,18 +25,14 @@ trmm
          * -  ``std::complex<double>`` 
 
 
-
-
 .. container:: section
-   :name: GUID-E1AAECF3-E29D-411F-B052-2F2E8080F3A1
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The trmm routines compute a scalar-matrix-matrix product where one of
+   The ``trmm`` routines compute a scalar-matrix-matrix product where one of
    the matrices in the multiplication is triangular. The argument
    ``left_right`` determines if the triangular matrix, ``A``, is on the
    left of the multiplication (``left_right`` = ``side::left``) or on
@@ -54,8 +40,6 @@ trmm
    ``left_right``. The operation is defined as
 
 
-  
-
 
       B <- alpha*op(A)*B
 
@@ -63,9 +47,6 @@ trmm
    or
 
 
-  
-
-
       B <- alpha*B*op(A)
 
 
@@ -86,45 +67,46 @@ trmm
    ``n`` x ``n``, depending on ``left_right``.
 
 
+trmm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+      .. cpp:function::  void onemkl::blas::trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb)
+
 .. container:: section
-   :name: GUID-DE8B0FD7-11E3-42BC-99ED-3A07040FA6CB
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    left_right
       Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
 
 
    uplo
-      Specifies whether the matrix ``A`` is upper or lower triangular.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_diag
       Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
 
 
    m
@@ -168,11 +150,9 @@ trmm
 
 
 .. container:: section
-   :name: GUID-1F1FF9D8-3833-4C9E-9CAC-53BA1791DCF1
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -182,11 +162,9 @@ trmm
 
 
 .. container:: section
-   :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
@@ -194,15 +172,137 @@ trmm
    not need to be initialized at entry.
 
 
-.. container:: familylinks
+trmm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, T* b, std::int64_t ldb, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      left_right
+         Specifies whether ``A`` is on the left side of the
+         multiplication (``side::left``) or on the right side
+         (``side::right``). See :ref:`onemkl_datatypes` for more details.
+
+
+      upper_lower
+         Specifies whether the matrix ``A`` is upper or lower
+         triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      transa
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
+      unit_diag
+         Specifies whether ``A`` is assumed to be unit triangular (all
+         diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
+
+
+      m
+         Specifies the number of rows of ``B``. The value of ``m`` must
+         be at least zero.
 
+
+      n
+         Specifies the number of columns of ``B``. The value of ``n``
+         must be at least zero.
+
+
+      alpha
+         Scaling factor for the matrix-matrix product.
+
+
+      a
+         Pointer to input matrix ``A``. Must have size at least
+         ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
+         ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``m`` if
+         ``left_right`` = ``side::left``, and at least ``n`` if
+         ``left_right`` = ``side::right``. Must be positive.
+
+
+      b
+         Pointer to input/output matrix ``B``. Must have size at least
+         ``ldb``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be at least ``m`` and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      b
+         Pointer to the output matrix, overwritten by
+         ``alpha``\ \*op(``A``)\*\ ``B`` or
+         ``alpha``\ \*\ ``B``\ \*op(``A``).
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B``
+      do not need to be initialized at entry.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/trmv.rst b/docs/domains/blas/trmv.rst
index 14476e1e8..f015007ee 100644
--- a/docs/domains/blas/trmv.rst
+++ b/docs/domains/blas/trmv.rst
@@ -1,4 +1,4 @@
-.. _trmv:
+.. _onemkl_blas_trmv:
 
 trmv
 ====
@@ -10,16 +10,6 @@ trmv
    Computes a matrix-vector product using a triangular matrix.
 
 
-   .. container:: section
-      :name: GUID-15041079-C2F5-4D3C-85C2-262E184F7FFE
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void trmv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, buffer<T,1>      &a, std::int64_t lda, buffer<T,1> &x, std::int64_t incx)
 
       ``trmv`` supports the following precisions.
 
@@ -37,21 +27,16 @@ trmv
 
 
 .. container:: section
-   :name: GUID-420DC613-E11B-48A8-B73F-55B55EBFC3B7
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The trmv routines compute a matrix-vector product with a triangular
+   The ``trmv`` routines compute a matrix-vector product with a triangular
    matrix. The operation is defined
 
 
-  
-
-
       x <- op(A)*x
 
 
@@ -69,36 +54,40 @@ trmv
    ``x`` is a vector of length ``n``.
 
 
+trmv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+      .. cpp:function::  void onemkl::blas::trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -129,11 +118,9 @@ trmv
 
 
 .. container:: section
-   :name: GUID-7BF1D5C9-EB8C-4BD6-B0E7-A66DAC3221F9
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -141,15 +128,105 @@ trmv
       Buffer holding the updated vector ``x``.
 
 
-.. container:: familylinks
+trmv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
 
+
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      x
+         Pointer to input vector ``x``. The array holding input vector
+         ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+         See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the updated vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/trsm.rst b/docs/domains/blas/trsm.rst
index 958d231db..ec5e0ede5 100644
--- a/docs/domains/blas/trsm.rst
+++ b/docs/domains/blas/trsm.rst
@@ -1,4 +1,4 @@
-.. _trsm:
+.. _onemkl_blas_trsm:
 
 trsm
 ====
@@ -10,18 +10,8 @@ trsm
    Solves a triangular matrix equation (forward or backward solve).
 
 
-   .. container:: section
-      :name: GUID-6F8E0E22-B30A-4825-B508-CEDE0CAC8B90
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
 
-      .. cpp:function::  void trsm(queue &exec_queue, side left_right,      uplo upper_lower, transpose transa, diag unit_diag, std::int64_t      m, std::int64_t n, T alpha, buffer<T,1> &a, std::int64_t lda,      buffer<T,1> &b, std::int64_t ldb)
-
-      trsm supports the following precisions.
+      ``trsm`` supports the following precisions.
 
 
       .. list-table:: 
@@ -37,18 +27,14 @@ trsm
 
 
 .. container:: section
-   :name: GUID-AE6CFEF4-4058-49C3-BABC-2B05D6594555
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The trsm routines solve one of the following matrix equations:
-
+   The ``trsm`` routines solve one of the following matrix equations:
 
-  
 
 
       op(A)*X = alpha*B,
@@ -57,9 +43,6 @@ trsm
    or
 
 
-  
-
-
       X*op(A) = alpha*B,
 
 
@@ -84,45 +67,46 @@ trsm
    is overwritten by the solution matrix ``X``.
 
 
+trsm (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+      .. cpp:function::  void onemkl::blas::trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb)
+
 .. container:: section
-   :name: GUID-0BBDCB60-8CDE-4EBD-BDE5-F7688B4B29F4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    left_right
       Specifies whether ``A`` multiplies ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
 
 
    uplo
-      Specifies whether the matrix ``A`` is upper or lower triangular.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_diag
       Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
 
 
    m
@@ -166,11 +150,9 @@ trsm
 
 
 .. container:: section
-   :name: GUID-7AC6C3B9-7A31-4E0B-B770-FD607E7F9BE5
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -179,11 +161,9 @@ trsm
 
 
 .. container:: section
-   :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
@@ -191,15 +171,136 @@ trsm
    not need to be initialized at entry.
 
 
-.. container:: familylinks
+trsm (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-3-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, T* b, std::int64_t ldb, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      left_right
+         Specifies whether ``A`` multiplies ``X`` on the left
+         (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
+
+
+      upper_lower
+         Specifies whether the matrix ``A`` is upper or lower
+         triangular. See :ref:`onemkl_datatypes` for more details.
+
+
+      transa
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
+
+
+      unit_diag
+         Specifies whether ``A`` is assumed to be unit triangular (all
+         diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
+
+
+      m
+         Specifies the number of rows of ``B``. The value of ``m`` must
+         be at least zero.
 
+
+      n
+         Specifies the number of columns of ``B``. The value of ``n``
+         must be at least zero.
+
+
+      alpha
+         Scaling factor for the solution.
+
+
+      a
+         Pointer to input matrix ``A``. Must have size at least
+         ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
+         ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of ``A``. Must be at least ``m`` if
+         ``left_right`` = ``side::left``, and at least ``n`` if
+         ``left_right`` = ``side::right``. Must be positive.
+
+
+      b
+         Pointer to input/output matrix ``B``. Must have size at least
+         ``ldb``\ \*\ ``n``. See `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      ldb
+         Leading dimension of ``B``. Must be at least ``m`` and
+         positive.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      b
+         Pointer to the output matrix. Overwritten by the solution
+         matrix ``X``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Notes
+         :class: sectiontitle
+
+
+      If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B``
+      do not need to be initialized at entry.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/trsm_batch.rst b/docs/domains/blas/trsm_batch.rst
index 0cf82ce21..c6710ee74 100644
--- a/docs/domains/blas/trsm_batch.rst
+++ b/docs/domains/blas/trsm_batch.rst
@@ -1,4 +1,4 @@
-.. _trsm_batch:
+.. _onemkl_blas_trsm_batch:
 
 trsm_batch
 ==========
@@ -6,28 +6,10 @@ trsm_batch
 
 .. container::
 
-
-   Computes groups of matrix-matrix product with general matrices.
-
-
-   .. container:: section
-      :name: GUID-6F8E0E22-B30A-4825-B508-CEDE0CAC8B90
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      **Group API**
-
-
-      .. cpp:function::  void trsm_batch(queue &exec_queue, buffer<side,      1> &left_right_array, buffer<uplo,1> &upper_lower_array,      buffer<transpose,1> &trans_array, buffer<diag,1> &unit_diag_array,      buffer<std::int64_t,1> &m_array, buffer<std::int64_t,1> &n_array,      buffer<T,1> &alpha_array, buffer<T,1> &a_array,      buffer<std::int64_t,1> &lda_array, buffer<T,1> &b_array,      buffer<std::int64_t,1> ldb_array, std::int64_t group_count,      buffer<std::int64_t,1> &group_size_array)
-
-      **Strided API**
-
-
-      .. cpp:function::  void trsm_batch(queue &exec_queue, side      left_right, uplo upper_lower, transpose trans, diag unit_diag,      std::int64_t m, std::int64_t n, T alpha, buffer<T,1> &a,      std::int64_t lda, std::int64_t stridea, buffer<T,1> &b,      std::int64_t ldb, std::int64_t strideb, std::int64_t batch_size)
+   The ``trsm_batch`` routines are batched versions of `trsm <trsm.html>`__, performing
+   multiple ``trsm`` operations in a single call. Each ``trsm`` 
+   solves an equation of the form op(A) \* X = alpha \* B or X \* op(A) = alpha \* B. 
+   
 
       ``trsm_batch`` supports the following precisions.
 
@@ -42,60 +24,28 @@ trsm_batch
          * -  ``std::complex<double>`` 
 
 
-
+trsm_batch (Buffer Version)
+---------------------------
 
 .. container:: section
-   :name: GUID-AE6CFEF4-4058-49C3-BABC-2B05D6594555
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
-
-   The trsm_batch routines solve a series of equations of the form op(A)
-   \* X = alpha \* B or X \* op(A) = alpha \* B. They are similar to the
-   trsm routine counterparts, but the trsm_batch routines solve linear
-   equations with groups of matrices. The groups contain matrices with
-   the same parameters.
-
-
-   For the group API, the operation is defined as
-
-
-   ::
-
-
-      offa = 0, offb = 0
-      for i = 0 … group_count – 1
-          left_right, uplo, trans, unit_diag, m, n, lda, ldb, alpha and group_size at position i in left_right_array, uplo_array, trans_array, unit_diag_array, m_array, n_array, lda_array, ldb_array, alpha_array and group_size_array
-          sizea = left_right == onemkl::side::L ? lda * m : lda * n;
-          sizeb = ldb * n;
-          for j = 0 … group_size – 1
-              A and B are matrices of size sizea and sizeb at offset offa and offb in a and b.
-              if (left_right == onemkl::side::L) then
-                  computes X such that op(A) * X = alpha * B
-              else
-                  computes X such that X * op(A) = alpha * B
-              end if
-              B := X
-              offa += sizea, offb += sizeb
-          end for
-      end for     
-
-
-   For the strided API, the operation is defined as
-
+   The buffer version of ``trsm_batch`` supports only the strided API. 
+   
+   The strided API operation is defined as
 
    ::
 
 
       for i = 0 … batch_size – 1
           A and B are matrices at offset i * stridea and i * strideb in a and b.
-          if (left_right == onemkl::side::L) then
-              computes X such that op(A) * X = alpha * B
+          if (left_right == onemkl::side::left) then
+              compute X such that op(A) * X = alpha * B
           else
-              computes X such that X * op(A) = alpha * B
+              compute X such that X * op(A) = alpha * B
           end if
           B := X
       end for
@@ -104,215 +54,69 @@ trsm_batch
    where:
 
 
-   -  op(``A``) is one of op(``A``) = ``A``, or op(A) = ``A``\ :sup:`T`,
-      or op(``A``) = ``A``\ :sup:`H`
+   op(``A``) is one of op(``A``) = ``A``, or op(A) = ``A``\ :sup:`T`,
+   or op(``A``) = ``A``\ :sup:`H`
 
 
-   -  alpha is a scalar
+   ``alpha`` is a scalar
 
 
-   -  ``A`` is a triangular matrix
+   ``A`` is a triangular matrix
 
 
-   -  ``B`` and ``X`` are ``m`` x ``n`` general matrices
-
-
-   -  The a and b buffers contains all the input matrices. The stride
-      between matrices is either given by the exact size of the matrix
-      (for the group API) or by the stride parameter. The total number
-      of matrices in a and b is given by the 
-      
-      |image0| 
-      
-      for the strided
-      API.
+   ``B`` and ``X`` are ``m`` x ``n`` general matrices
 
 
    ``A`` is either ``m`` x ``m`` or ``n`` x ``n``,depending on whether
-   it multiplies ``X`` on the leftor right. On return, the matrix ``B``
-   is overwrittenby the solution matrix ``X``.
-
-
-.. container:: section
-   :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A
-
-
-   .. rubric:: Input Parameters - Group API
-      :name: input-parameters---group-api
-      :class: sectiontitle
-
-
-   left_right_array
-      Buffer holding ``group_count onemkl::side`` value.
-
-
-      For the group ``i``, ``left_right`` is the ``i``\ th element in
-      the left_right_array buffer and specifies whether ``A`` multiplies
-      ``X`` on the left (``side::left``) or on the right
-      (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
-
-
-   uplo_array
-      Buffer holding ``group_count onemkl::uplo`` value.
-
-
-      For the group ``i``, ``uplo`` is the ``i``\ th element in the
-      uplo_array buffer and specifies whether ``A`` is upper or lower
-      triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
-
-
-   trans_array
-      Buffer holding ``group_count onemkl::transpose`` value.
-
-
-      For the group ``i``, ``trans`` is the ``i``\ th element in the
-      trans_array buffer and specifies the form of ``op``\ (``A``) used
-      in the matrix multiplication. See
-      :ref:`onemkl_datatypes` for more
-      details.
-
-
-   unit_diag__array
-      Buffer holding ``group_count onemkl::diag`` value.
-
-
-      For the group ``i``, ``unit_diag`` is the ``i``\ th element in the
-      unit_diag_array buffer and specifies whether ``A`` is assumed to
-      be unit triangular (all diagonal elements are 1). See
-      :ref:`onemkl_datatypes` for more
-      details.
-
+   it multiplies ``X`` on the left or right. On return, the matrix ``B``
+   is overwritten by the solution matrix ``X``.
 
-   m_array
-      Buffer holding ``group_count`` integer. For the group ``i``, ``m``
-      is the ``i``\ th element in the m_array buffer and specifies the
-      number of rows of ``B``. Must be at least zero.
-
-
-   n_array
-      Buffer holding ``group_count`` integer. For the group ``i``, ``n``
-      is the ``i``\ th element in the n_array buffer and specifies the
-      number of columns of ``B``. Must be at least zero.
-
-
-   alpha_array
-      Buffer holding ``group_count`` scalar element. For the group
-      ``i``, ``alpha`` is the ``i``\ th element in the alpha_array
-      buffer and specifies the scaling factor for the matrix-matrix
-      product.
-
-
-   a
-      Buffer holding the input matrix ``A``. The total size of the
-      buffer ``a`` must be at least the sum of the sizes of all the
-      matricies ``A``. That is,
-
-
-      |image1|
-
-
-      where
-      ``sizeai = lda_array[i] * (left_right == onemkl::side::L ? m : n)``
-
-
-      See `Matrix
-      Storage <../matrix-storage.html>`__ for
-      more details.
-
-
-   lda_array
-      Buffer holding ``group_count`` integer. For the group ``i``,
-      ``lda`` is the ``i``\ th element in the lda_array buffer and
-      specifies the leading dimension of ``A``. Must be at least ``m``
-      if ``A`` is not transposed, and at least ``k`` if ``A`` is
-      transposed. Must be positive.
-
-
-   b
-      Buffer holding the input matrix ``B``. The total size of the
-      buffer ``b`` must be at least the sum of the sizes of all the
-      matricies ``B``. That is,
-
-
-      |image2|
-
-
-      See `Matrix
-      Storage <../matrix-storage.html>`__ for
-      more details.
 
+   The a and b buffers contain all the input matrices. The stride 
+   between matrices is given by the stride parameter. The total number
+   of matrices in the a and b buffers is given by the ``batch_size`` parameter.
+      
 
-   ldb_array
-      Buffer holding ``group_count`` integer. For the group ``i``,
-      ``ldb`` is the ``i``\ th element in the ldb_array buffer and
-      specifies the leading dimension of ``B``. Must be at least ``n``.
-      Must be positive.
+   **Strided API**
 
+   .. container:: section
 
-   group_count
-      Specifies the number of groups. Must be at least 0.
 
+      .. rubric:: Syntax
+         :class: sectiontitle
 
-   group_size_array
-      Buffer holding the group_count integer. For the group ``i``,
-      ``ldb`` is the ``i``\ th element in the group_size_array buffer
-      specifies the number of matrix multiply operations in
-      group\ ``i``. Each element in group_size_array must be at least 0.
+      .. cpp:function::  void onemkl::blas::trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer<T,1> &a, std::int64_t lda, std::int64_t stridea, sycl::buffer<T,1> &b, std::int64_t ldb, std::int64_t strideb, std::int64_t batch_size)
 
 
 .. container:: section
-   :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700
 
 
-   .. rubric:: Output Parameters - Group API
-      :name: output-parameters---group-api
+   .. rubric:: Input Parameters
       :class: sectiontitle
 
 
-   b
-      Output buffer, overwritten by the ``total_batch_count`` solution
-      matrices ``X``.
-
-
-.. container:: section
-   :name: GUID-D067773A-45A3-4D24-B10A-46E27834947E
-
-
-   .. rubric:: Input Parameters - Strided API
-      :name: input-parameters---strided-api
-      :class: sectiontitle
+   queue
+      The queue where the routine should be executed.
 
 
    left_right
       Specifies whether the matrices ``A`` multiply ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
 
 
-   uplo
+   upper_lower
       Specifies whether the matrices ``A`` are upper or lower
-      triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
       Specifies ``op(A)``, the transposition operation applied to the
-      matrices ``A``. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_diag
       Specifies whether the matrices ``A`` are assumed to be unit
-      triangular (all diagonal elements are 1.). See
-      :ref:`onemkl_datatypes` for more
-      details.
+      triangular (all diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
 
 
    m
@@ -328,59 +132,30 @@ trsm_batch
 
 
    a
-      Buffer holding the input matrices ``A``. Must have size at least
-      ``stridea*batch_size``.
+      Buffer holding the input matrices ``A`` with size ``stridea*batch_size``.
 
 
    lda
       Leading dimension of the matrices ``A``. Must be at least ``m`` if
-      left_right = ``side::left``, and at least ``n`` if left_right =
+      ``left_right`` = ``side::left``, and at least ``n`` if ``left_right`` =
       ``side::right``. Must be positive.
 
 
    stridea
-      Stride between the different ``A`` matrices.
-
-
-      If left_right = ``side::left``, the matrices ``A`` are
-      ``m``-by-``m`` matrices, so stridea must be at least lda\*\ ``m``.
-
-
-      If left_right = ``side::right``, the matrices ``A`` are
-      ``n``-by-``n`` matrices, so stridea must be at least lda\*\ ``n``.
+      Stride between different ``A`` matrices.
 
 
    b
-      Buffer holding the input matrices ``B``. Must have size at least
-      ``strideb*batch_size``.
+      Buffer holding the input matrices ``B`` with size ``strideb*batch_size``.
 
 
    ldb
-      Leading dimension of the matrices ``B``. Must be at least ``m``
-      and must be positive.
+      Leading dimension of the matrices ``B``. Must be at least ``m``. 
+      Must be positive.
 
 
    strideb
-      Stride between the different ``B`` matrices. Must be at least
-      ldb\*\ ``n``.
-
-
-   beta
-      Scaling factor for the matrices ``C``.
-
-
-   c
-      Buffer holding input/output matrices ``C``. Must have size at
-      least ``stridec*batch_size``.
-
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least ``m``.
-
-
-   stridec
-      Stride between the different ``C`` matrices. Must be at least
-      ``ldc*n``.
+      Stride between different ``B`` matrices.
 
 
    batch_size
@@ -388,11 +163,9 @@ trsm_batch
 
 
 .. container:: section
-   :name: GUID-98C3DE17-4F5F-41A1-B431-48148153ABBA
 
 
-   .. rubric:: Output Parameters - Strided API
-      :name: output-parameters---strided-api
+   .. rubric:: Output Parameters
       :class: sectiontitle
 
 
@@ -402,16 +175,15 @@ trsm_batch
 
 
 .. container:: section
-   :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF
 
 
    .. rubric:: Notes
-      :name: notes
       :class: sectiontitle
 
 
-   If ``alpha`` = 0, matrix ``B`` is set to zero, and the matrices ``A``
-   and ``B`` do not need to be initialized before calling trsm_batch.
+   If ``alpha`` = 0, matrix ``B`` is set to zero and the matrices ``A``
+   and ``B`` do not need to be initialized before calling ``trsm_batch``.
+
 
 
 .. container:: familylinks
@@ -423,14 +195,3 @@ trsm_batch
       **Parent topic:** :ref:`blas-like-extensions`
       
 
-
-.. container::
-
-
-.. |image0| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee1.png
-   :class: img-middle
-.. |image1| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee2.png
-   :class: img-middle
-.. |image2| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee3.png
-   :class: img-middle
-
diff --git a/docs/domains/blas/trsv.rst b/docs/domains/blas/trsv.rst
index e1dba6e43..8662d3197 100644
--- a/docs/domains/blas/trsv.rst
+++ b/docs/domains/blas/trsv.rst
@@ -1,4 +1,4 @@
-.. _trsv:
+.. _onemkl_blas_trsv:
 
 trsv
 ====
@@ -11,16 +11,6 @@ trsv
    triangular matrix.
 
 
-   .. container:: section
-      :name: GUID-9BA4C1B6-479B-41B1-BCA8-7826F40DA952
-
-
-      .. rubric:: Syntax
-         :name: syntax
-         :class: sectiontitle
-
-
-      .. cpp:function::  void trsv(queue &exec_queue, uplo upper_lower,      transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t      k, buffer<T,1> &a, std::int64_t lda, buffer<T,1> &x, std::int64_t      incx)
 
       ``trsv`` supports the following precisions.
 
@@ -38,20 +28,16 @@ trsv
 
 
 .. container:: section
-   :name: GUID-D500B67B-5DD6-4471-B0BD-53FD9A3C7BF2
 
 
    .. rubric:: Description
-      :name: description
       :class: sectiontitle
 
 
-   The trsv routines compute a matrix-vector product with a triangular
+   The ``trsv`` routines solve a system of linear equations with a triangular
    band matrix. The operation is defined as
 
 
-  
-
 
       op(A)*x = b
 
@@ -70,36 +56,41 @@ trsv
    ``b`` and ``x`` are vectors of length ``n``.
 
 
+trsv (Buffer Version)
+---------------------
+
+.. container::
+
+   .. container:: section
+
+
+      .. rubric:: Syntax
+         :class: sectiontitle
+
+
+      .. cpp:function::  void onemkl::blas::trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx)
+
 .. container:: section
-   :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4
 
 
    .. rubric:: Input Parameters
-      :name: input-parameters
       :class: sectiontitle
 
 
-   exec_queue
+   queue
       The queue where the routine should be executed.
 
 
    upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
 
 
    trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
+      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
 
 
    unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See
-      :ref:`onemkl_datatypes`
-      for more details.
+      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
 
 
    n
@@ -131,11 +122,9 @@ trsv
 
 
 .. container:: section
-   :name: GUID-7E0AF44F-2D83-41A3-A58E-50400ECDBD9A
 
 
    .. rubric:: Output Parameters
-      :name: output-parameters
       :class: sectiontitle
 
 
@@ -143,15 +132,105 @@ trsv
       Buffer holding the solution vector ``x``.
 
 
-.. container:: familylinks
+trsv (USM Version)
+------------------
 
+.. container::
 
-   .. container:: parentlink
+   .. container:: section
 
 
-      **Parent topic:** :ref:`blas-level-2-routines`
-      
+      .. rubric:: Syntax
+         :class: sectiontitle
 
 
-.. container::
+      .. container:: dlsyntaxpara
+
+         .. cpp:function::  sycl::event onemkl::blas::trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {})
+
+   .. container:: section
+
+
+      .. rubric:: Input Parameters
+         :class: sectiontitle
+
+
+      queue
+         The queue where the routine should be executed.
+
+
+      upper_lower
+         Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+
+      trans
+         Specifies op(``A``), the transposition operation applied to
+         ``A``. See :ref:`onemkl_datatypes` for more details.
 
+
+      unit_nonunit
+         Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
+
+
+      n
+         Number of rows and columns of ``A``. Must be at least zero.
+
+
+      a
+         Pointer to input matrix ``A``. The array holding input matrix
+         ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix
+         and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      lda
+         Leading dimension of matrix ``A``. Must be at least ``n``, and
+         positive.
+
+
+      x
+         Pointer to the ``n``-element right-hand side vector ``b``. The
+         array holding the ``n``-element right-hand side vector ``b``
+         must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See
+         `Matrix and Vector
+         Storage <../matrix-storage.html>`__ for
+         more details.
+
+
+      incx
+         Stride of vector ``x``.
+
+
+      dependencies
+         List of events to wait for before starting computation, if any.
+         If omitted, defaults to no dependencies.
+
+
+   .. container:: section
+
+
+      .. rubric:: Output Parameters
+         :class: sectiontitle
+
+
+      x
+         Pointer to the solution vector ``x``.
+
+
+   .. container:: section
+
+
+      .. rubric:: Return Values
+         :class: sectiontitle
+
+
+      Output event to wait on to ensure computation is complete.
+
+
+.. container:: familylinks
+
+
+   .. container:: parentlink
+
+
+      **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/include/onemkl/blas/blas.hpp b/include/onemkl/blas/blas.hpp
index d06042049..acc1111d6 100644
--- a/include/onemkl/blas/blas.hpp
+++ b/include/onemkl/blas/blas.hpp
@@ -45,6 +45,8 @@
 namespace onemkl {
 namespace blas {
 
+// Buffer APIs
+
 static inline void asum(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
                         cl::sycl::buffer<float, 1> &result) {
@@ -298,76 +300,6 @@ static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose tran
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                              cl::sycl::buffer<transpose, 1> &transb,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<std::int64_t, 1> &k,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb,
-                              cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                              cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                       beta, c, ldc, group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                       beta, c, ldc, group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                       beta, c, ldc, group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                       beta, c, ldc, group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
 static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
                               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                               cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -1660,72 +1592,6 @@ static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lowe
                        ldb);
 }
 
-static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                              cl::sycl::buffer<uplo, 1> &upper_lower,
-                              cl::sycl::buffer<transpose, 1> &trans,
-                              cl::sycl::buffer<diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m,
-                       n, alpha, a, lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m,
-                       n, alpha, a, lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m,
-                       n, alpha, a, lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m,
-                       n, alpha, a, lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
 static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
                               float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -1814,6 +1680,1939 @@ static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose tran
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
+// USM (Unified Shared Memory) APIs
+
+static inline cl::sycl::event asum(  // USM asum: std::complex<float> input, float result
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    asum_precondition(queue, n, x, incx, result, dependencies);  // hook invoked before the call
+    auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::asum returned
+}
+
+static inline cl::sycl::event asum(  // USM asum: std::complex<double> input, double result
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event asum(  // USM asum: float input, float result
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event asum(  // USM asum: double input, double result
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy(  // USM axpy: float
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);  // hook before the call
+    auto done =
+        detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);  // hook after the call
+    return done;  // hand back what detail::axpy returned
+}
+
+static inline cl::sycl::event axpy(  // USM axpy: double
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done =
+        detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy(  // USM axpy: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done =
+        detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy(  // USM axpy: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done =
+        detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy_batch(  // USM axpy_batch (group_count/group_size form): float
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);  // hook invoked before the call
+    auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy,
+                                   group_count, group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::axpy_batch returned
+}
+
+static inline cl::sycl::event axpy_batch(  // USM axpy_batch (group form): double
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy,
+                                   group_count, group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy_batch(  // USM axpy_batch (group form): std::complex<float>
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy,
+                                   group_count, group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;
+}
+
+static inline cl::sycl::event axpy_batch(  // USM axpy_batch (group form): std::complex<double>
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy,
+                                   group_count, group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;
+}
+
+static inline cl::sycl::event copy(  // USM copy: float
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);  // hook invoked before the call
+    auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::copy returned
+}
+
+static inline cl::sycl::event copy(  // USM copy: double
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event copy(  // USM copy: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event copy(  // USM copy: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event dot(  // USM dot: float inputs, float result
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);  // hook before the call
+    auto done =
+        detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);  // hook after the call
+    return done;  // hand back what detail::dot returned
+}
+
+static inline cl::sycl::event dot(  // USM dot: double inputs, double result
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done =
+        detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event dot(  // USM dot: float inputs, double result (mixed precision)
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done =
+        detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event dotc(  // USM dotc: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);  // hook before the call
+    auto done =
+        detail::dotc(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);  // hook after the call
+    return done;  // hand back what detail::dotc returned
+}
+
+static inline cl::sycl::event dotc(  // USM dotc: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done =
+        detail::dotc(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event dotu(  // USM dotu: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);  // hook before the call
+    auto done =
+        detail::dotu(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);  // hook after the call
+    return done;  // hand back what detail::dotu returned
+}
+
+static inline cl::sycl::event dotu(  // USM dotu: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done =
+        detail::dotu(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gbmv(  // USM gbmv: float
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);  // hook invoked before the call
+    auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::gbmv returned
+}
+
+static inline cl::sycl::event gbmv(  // USM gbmv: double
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gbmv(  // USM gbmv: std::complex<float>
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gbmv(  // USM gbmv: std::complex<double>
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm(  // USM gemm: float
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // hook invoked before the call
+    auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda,
+                             b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::gemm returned
+}
+
+static inline cl::sycl::event gemm(  // USM gemm: double
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda,
+                             b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm(  // USM gemm: std::complex<float>
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda,
+                             b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm(  // USM gemm: std::complex<double>
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda,
+                             b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, pointer-array/group form: float
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);  // hook before the call
+    auto done =
+        detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
+                           ldb, beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);  // hook after the call
+    return done;  // hand back what detail::gemm_batch returned
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, pointer-array/group form: double
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto done =
+        detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
+                           ldb, beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, group form: std::complex<float>
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto done =
+        detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
+                           ldb, beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, group form: std::complex<double>
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto done =
+        detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
+                           ldb, beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, stride_a/b/c + batch_size form: float
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a,
+                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                   batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;  // hand back what detail::gemm_batch returned
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, strided form: double
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a,
+                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                   batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, strided form: std::complex<float>
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a,
+                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                   batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemm_batch(  // USM gemm_batch, strided form: std::complex<double>
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a,
+                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                   batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemmt(  // USM gemmt: float
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);  // hook invoked before the call
+    auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k,
+                              alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);  // hook invoked after the call
+    return done;  // hand back what detail::gemmt returned
+}
+
+static inline cl::sycl::event gemmt(  // USM gemmt: double
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k,
+                              alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemmt(  // USM gemmt: std::complex<float>
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k,
+                              alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemmt(  // USM gemmt: std::complex<double>
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k,
+                              alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;
+}
+
+static inline cl::sycl::event gemv(  // float overload; forwards to detail::gemv
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::gemv
+}
+
+static inline cl::sycl::event gemv(  // double overload; forwards to detail::gemv
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::gemv
+}
+
+static inline cl::sycl::event gemv(  // std::complex<float> overload; forwards to detail::gemv
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::gemv
+}
+
+static inline cl::sycl::event gemv(  // std::complex<double> overload; forwards to detail::gemv
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::gemv
+}
+
+static inline cl::sycl::event ger(  // float overload; forwards to detail::ger
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::ger(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                          dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::ger
+}
+
+static inline cl::sycl::event ger(  // double overload; forwards to detail::ger
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::ger(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                          dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::ger
+}
+
+static inline cl::sycl::event gerc(  // std::complex<float> overload; forwards to detail::gerc
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::gerc(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                           dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::gerc
+}
+
+static inline cl::sycl::event gerc(  // std::complex<double> overload; forwards to detail::gerc
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::gerc(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                           dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::gerc
+}
+
+static inline cl::sycl::event geru(  // std::complex<float> overload; forwards to detail::geru
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::geru(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                           dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::geru
+}
+
+static inline cl::sycl::event geru(  // std::complex<double> overload; forwards to detail::geru
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::geru(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
+                           dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::geru
+}
+
+static inline cl::sycl::event hbmv(  // std::complex<float> overload; forwards to detail::hbmv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = detail::hbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x,
+                           incx, beta, y, incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event returned by detail::hbmv
+}
+
+static inline cl::sycl::event hbmv(  // std::complex<double> overload; forwards to detail::hbmv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = detail::hbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x,
+                           incx, beta, y, incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event returned by detail::hbmv
+}
+
+static inline cl::sycl::event hemm(  // std::complex<float> overload; forwards to detail::hemm
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = detail::hemm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                           lda, b, ldb, beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event returned by detail::hemm
+}
+
+static inline cl::sycl::event hemm(  // std::complex<double> overload; forwards to detail::hemm
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = detail::hemm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                           lda, b, ldb, beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event returned by detail::hemm
+}
+
+static inline cl::sycl::event hemv(  // std::complex<float> overload; forwards to detail::hemv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::hemv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::hemv
+}
+
+static inline cl::sycl::event hemv(  // std::complex<double> overload; forwards to detail::hemv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::hemv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
+                           beta, y, incy, dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::hemv
+}
+
+static inline cl::sycl::event her(  // std::complex<float> data, float alpha; uses detail::her
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = detail::her(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
+                          dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event returned by detail::her
+}
+
+static inline cl::sycl::event her(  // std::complex<double> data, double alpha; uses detail::her
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = detail::her(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
+                          dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event returned by detail::her
+}
+
+static inline cl::sycl::event her2(  // std::complex<float> overload; forwards to detail::her2
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::her2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                           a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::her2
+}
+
+static inline cl::sycl::event her2(  // std::complex<double> overload; forwards to detail::her2
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = detail::her2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                           a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by detail::her2
+}
+
+static inline cl::sycl::event her2k(  // std::complex<float> data and alpha, float beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = detail::her2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                            b, ldb, beta, c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // event returned by detail::her2k
+}
+
+static inline cl::sycl::event her2k(  // std::complex<double> data and alpha, double beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = detail::her2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                            b, ldb, beta, c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // event returned by detail::her2k
+}
+
+static inline cl::sycl::event herk(  // std::complex<float> data, float alpha and beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = detail::herk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                           beta, c, ldc, dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // event returned by detail::herk
+}
+
+static inline cl::sycl::event herk(  // std::complex<double> data, double alpha and beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = detail::herk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                           beta, c, ldc, dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // event returned by detail::herk
+}
+
+static inline cl::sycl::event hpmv(  // std::complex<float> overload; forwards to detail::hpmv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::hpmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
+                           y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::hpmv
+}
+
+static inline cl::sycl::event hpmv(  // std::complex<double> overload; forwards to detail::hpmv
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = detail::hpmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
+                           y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by detail::hpmv
+}
+
+static inline cl::sycl::event hpr(  // std::complex<float> data, float alpha; uses detail::hpr
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev =
+        detail::hpr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // event returned by detail::hpr
+}
+
+static inline cl::sycl::event hpr(  // std::complex<double> data, double alpha; uses detail::hpr
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev =
+        detail::hpr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // event returned by detail::hpr
+}
+
+static inline cl::sycl::event hpr2(  // std::complex<float> overload; forwards to detail::hpr2
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = detail::hpr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                           a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // event returned by detail::hpr2
+}
+
+static inline cl::sycl::event hpr2(  // std::complex<double> overload; forwards to detail::hpr2
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = detail::hpr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                           a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // event returned by detail::hpr2
+}
+
+static inline cl::sycl::event iamax(  // float overload; forwards to detail::iamax
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamax
+}
+
+static inline cl::sycl::event iamax(  // double overload; forwards to detail::iamax
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamax
+}
+
+static inline cl::sycl::event iamax(  // std::complex<float> overload; forwards to detail::iamax
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamax
+}
+
+static inline cl::sycl::event iamax(  // std::complex<double> overload; forwards to detail::iamax
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamax
+}
+
+static inline cl::sycl::event iamin(  // float overload; forwards to detail::iamin
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamin
+}
+
+static inline cl::sycl::event iamin(  // double overload; forwards to detail::iamin
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamin
+}
+
+static inline cl::sycl::event iamin(  // std::complex<float> overload; forwards to detail::iamin
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamin
+}
+
+static inline cl::sycl::event iamin(  // std::complex<double> overload; forwards to detail::iamin
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::iamin
+}
+
+static inline cl::sycl::event nrm2(  // std::complex<float> x, float result
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::nrm2
+}
+
+static inline cl::sycl::event nrm2(  // std::complex<double> x, double result
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::nrm2
+}
+
+static inline cl::sycl::event nrm2(  // float overload; forwards to detail::nrm2
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::nrm2
+}
+
+static inline cl::sycl::event nrm2(  // double overload; forwards to detail::nrm2
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by detail::nrm2
+}
+
+static inline cl::sycl::event rot(  // std::complex<float> x and y, float c and s
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;  // event returned by detail::rot
+}
+
+static inline cl::sycl::event rot(  // std::complex<double> x and y, double c and s
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;  // event returned by detail::rot
+}
+
+static inline cl::sycl::event rot(  // float overload; forwards to detail::rot
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;  // event returned by detail::rot
+}
+
+static inline cl::sycl::event rot(  // double overload; forwards to detail::rot
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;  // event returned by detail::rot
+}
+
+static inline cl::sycl::event rotg(  // float overload; forwards to detail::rotg
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto ev = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return ev;  // event returned by detail::rotg
+}
+
+static inline cl::sycl::event rotg(  // double overload; forwards to detail::rotg
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto ev = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return ev;  // event returned by detail::rotg
+}
+
+static inline cl::sycl::event rotg(  // std::complex<float> a, b and s; float c
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto ev = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return ev;  // event returned by detail::rotg
+}
+
+static inline cl::sycl::event rotg(  // std::complex<double> a, b and s; double c
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto ev = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return ev;  // event returned by detail::rotg
+}
+
+static inline cl::sycl::event rotm(  // float overload; forwards to detail::rotm
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto ev =
+        detail::rotm(select_backend(queue), queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return ev;  // event returned by detail::rotm
+}
+
+static inline cl::sycl::event rotm(  // double overload; forwards to detail::rotm
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto ev =
+        detail::rotm(select_backend(queue), queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return ev;  // event returned by detail::rotm
+}
+
+static inline cl::sycl::event rotmg(  // rotmg, float overload: d1/d2/x1/param are raw pointers (presumably USM -- TODO confirm), y1 is passed by value.
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);  // hook run first, with every caller argument
+    auto done = detail::rotmg(select_backend(queue), queue, d1, d2, x1, y1, param, dependencies);  // backend chosen per call from the queue
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::rotmg
+}
+
+static inline cl::sycl::event rotmg(  // rotmg, double overload: d1/d2/x1/param are raw pointers (presumably USM -- TODO confirm), y1 is passed by value.
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);  // hook run first, with every caller argument
+    auto done = detail::rotmg(select_backend(queue), queue, d1, d2, x1, y1, param, dependencies);  // backend chosen per call from the queue
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::rotmg
+}
+
+static inline cl::sycl::event sbmv(  // sbmv, float overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::sbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);  // backend chosen per call from the queue
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::sbmv
+}
+
+static inline cl::sycl::event sbmv(  // sbmv, double overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::sbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x,
+                             incx, beta, y, incy, dependencies);  // backend chosen per call from the queue
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::sbmv
+}
+
+static inline cl::sycl::event scal(  // scal, float alpha with float x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event scal(  // scal, double alpha with double x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event scal(  // scal, std::complex<float> alpha with std::complex<float> x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event scal(  // scal, std::complex<double> alpha with std::complex<double> x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event scal(  // scal, mixed overload: real float alpha with std::complex<float> x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event scal(  // scal, mixed overload: real double alpha with std::complex<double> x, on a raw pointer (presumably USM -- TODO confirm).
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies);  // backend chosen per call from the queue
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::scal
+}
+
+static inline cl::sycl::event sdsdot(  // sdsdot, float-only entry point on raw pointers (presumably USM -- TODO confirm); x and y are read-only, result is writable.
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies);  // hook run first, with every caller argument
+    auto done =  // backend chosen per call from the queue
+        detail::sdsdot(select_backend(queue), queue, n, sb, x, incx, y, incy, result, dependencies);
+    sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::sdsdot
+}
+
+static inline cl::sycl::event spmv(  // spmv, float overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::spmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
+                             y, incy, dependencies);  // backend chosen per call from the queue
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spmv
+}
+
+static inline cl::sycl::event spmv(  // spmv, double overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::spmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
+                             y, incy, dependencies);  // backend chosen per call from the queue
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spmv
+}
+
+static inline cl::sycl::event spr(  // spr, float overload on raw pointers (presumably USM -- TODO confirm); x is read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // hook run first, with every caller argument
+    auto done =  // backend chosen per call from the queue
+        detail::spr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spr
+}
+
+static inline cl::sycl::event spr(  // spr, double overload on raw pointers (presumably USM -- TODO confirm); x is read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // hook run first, with every caller argument
+    auto done =  // backend chosen per call from the queue
+        detail::spr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spr
+}
+
+static inline cl::sycl::event spr2(  // spr2, float overload on raw pointers (presumably USM -- TODO confirm); x and y are read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);  // hook run first, with every caller argument
+    auto done = detail::spr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                             a, dependencies);  // backend chosen per call from the queue
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spr2
+}
+
+static inline cl::sycl::event spr2(  // spr2, double overload on raw pointers (presumably USM -- TODO confirm); x and y are read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);  // hook run first, with every caller argument
+    auto done = detail::spr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                             a, dependencies);  // backend chosen per call from the queue
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::spr2
+}
+
+static inline cl::sycl::event swap(  // swap, float overload on raw pointers (presumably USM -- TODO confirm); both x and y are writable.
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies);  // backend chosen per call from the queue
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::swap
+}
+
+static inline cl::sycl::event swap(  // swap, double overload on raw pointers (presumably USM -- TODO confirm); both x and y are writable.
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies);  // backend chosen per call from the queue
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::swap
+}
+
+static inline cl::sycl::event swap(  // swap, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); both x and y are writable.
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies);  // backend chosen per call from the queue
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::swap
+}
+
+static inline cl::sycl::event swap(  // swap, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); both x and y are writable.
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies);  // backend chosen per call from the queue
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::swap
+}
+
+static inline cl::sycl::event symm(  // symm, float overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                             lda, b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symm
+}
+
+static inline cl::sycl::event symm(  // symm, double overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                             lda, b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symm
+}
+
+static inline cl::sycl::event symm(  // symm, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                             lda, b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symm
+}
+
+static inline cl::sycl::event symm(  // symm, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // hook run first, with every caller argument
+    auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a,
+                             lda, b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symm
+}
+
+static inline cl::sycl::event symv(  // symv, float overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::symv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
+                             beta, y, incy, dependencies);  // backend chosen per call from the queue
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symv
+}
+
+static inline cl::sycl::event symv(  // symv, double overload on raw pointers (presumably USM -- TODO confirm); a and x are read-only, y is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);  // hook run first, with every caller argument
+    auto done = detail::symv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
+                             beta, y, incy, dependencies);  // backend chosen per call from the queue
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::symv
+}
+
+static inline cl::sycl::event syr(  // syr, float overload on raw pointers (presumably USM -- TODO confirm); x is read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
+                            dependencies);  // backend chosen per call from the queue
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr
+}
+
+static inline cl::sycl::event syr(  // syr, double overload on raw pointers (presumably USM -- TODO confirm); x is read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
+                            dependencies);  // backend chosen per call from the queue
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr
+}
+
+static inline cl::sycl::event syr2(  // syr2, float overload on raw pointers (presumably USM -- TODO confirm); x and y are read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                             a, lda, dependencies);  // backend chosen per call from the queue
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2
+}
+
+static inline cl::sycl::event syr2(  // syr2, double overload on raw pointers (presumably USM -- TODO confirm); x and y are read-only, a is writable.
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
+                             a, lda, dependencies);  // backend chosen per call from the queue
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2
+}
+
+static inline cl::sycl::event syr2k(  // syr2k, float overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                              b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2k
+}
+
+static inline cl::sycl::event syr2k(  // syr2k, double overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                              b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2k
+}
+
+static inline cl::sycl::event syr2k(  // syr2k, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                              b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2k
+}
+
+static inline cl::sycl::event syr2k(  // syr2k, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); a and b are read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // hook run first, with every caller argument
+    auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                              b, ldb, beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syr2k
+}
+
+static inline cl::sycl::event syrk(  // syrk, float overload on raw pointers (presumably USM -- TODO confirm); a is read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                             beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syrk
+}
+
+static inline cl::sycl::event syrk(  // syrk, double overload on raw pointers (presumably USM -- TODO confirm); a is read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                             beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syrk
+}
+
+static inline cl::sycl::event syrk(  // syrk, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                             beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syrk
+}
+
+static inline cl::sycl::event syrk(  // syrk, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, c is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run first, with every caller argument
+    auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
+                             beta, c, ldc, dependencies);  // backend chosen per call from the queue
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::syrk
+}
+
+static inline cl::sycl::event tbmv(  // tbmv, float overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbmv
+}
+
+static inline cl::sycl::event tbmv(  // tbmv, double overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbmv
+}
+
+static inline cl::sycl::event tbmv(  // tbmv, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbmv
+}
+
+static inline cl::sycl::event tbmv(  // tbmv, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbmv
+}
+
+static inline cl::sycl::event tbsv(  // tbsv, float overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbsv
+}
+
+static inline cl::sycl::event tbsv(  // tbsv, double overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbsv
+}
+
+static inline cl::sycl::event tbsv(  // tbsv, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbsv
+}
+
+static inline cl::sycl::event tbsv(  // tbsv, std::complex<double> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a,
+                             lda, x, incx, dependencies);  // backend chosen per call from the queue
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tbsv
+}
+
+static inline cl::sycl::event tpmv(  // tpmv, float overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);  // backend chosen per call from the queue
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tpmv
+}
+
+static inline cl::sycl::event tpmv(  // tpmv, double overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);  // backend chosen per call from the queue
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tpmv
+}
+
+static inline cl::sycl::event tpmv(  // tpmv, std::complex<float> overload on raw pointers (presumably USM -- TODO confirm); a is read-only, x is writable.
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run first, with every caller argument
+    auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);  // backend chosen per call from the queue
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);  // hook run after the call; `done` is not waited on here
+    return done;  // event obtained from detail::tpmv
+}
+
+static inline cl::sycl::event tpmv(  // std::complex<double> overload: precondition -> detail::tpmv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of detail::tpmv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event tpsv(  // float overload: tpsv_precondition -> detail::tpsv -> tpsv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of detail::tpsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event tpsv(  // double overload: tpsv_precondition -> detail::tpsv -> tpsv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of detail::tpsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event tpsv(  // std::complex<float> overload: precondition -> detail::tpsv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of detail::tpsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event tpsv(  // std::complex<double> overload: precondition -> detail::tpsv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x,
+                             incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of detail::tpsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmm(  // float overload: trmm_precondition -> detail::trmm -> trmm_postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trmm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmm(  // double overload: trmm_precondition -> detail::trmm -> trmm_postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trmm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmm(  // std::complex<float> overload: precondition -> detail::trmm -> postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trmm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmm(  // std::complex<double> overload: precondition -> detail::trmm -> postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trmm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmv(  // float overload: trmv_precondition -> detail::trmv -> trmv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trmv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmv(  // double overload: trmv_precondition -> detail::trmv -> trmv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trmv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmv(  // std::complex<float> overload: precondition -> detail::trmv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trmv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trmv(  // std::complex<double> overload: precondition -> detail::trmv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trmv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsm(  // float overload: trsm_precondition -> detail::trsm -> trsm_postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trsm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsm(  // double overload: trsm_precondition -> detail::trsm -> trsm_postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trsm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsm(  // std::complex<float> overload: precondition -> detail::trsm -> postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trsm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsm(  // std::complex<double> overload: precondition -> detail::trsm -> postcondition
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans,
+                             unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // result of detail::trsm (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsv(  // float overload: trsv_precondition -> detail::trsv -> trsv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsv(  // double overload: trsv_precondition -> detail::trsv -> trsv_postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsv(  // std::complex<float> overload: precondition -> detail::trsv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trsv (backend from select_backend(queue)); not waited on here
+}
+
+static inline cl::sycl::event trsv(  // std::complex<double> overload: precondition -> detail::trsv -> postcondition
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {}) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
+                             x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of detail::trsv (backend from select_backend(queue)); not waited on here
+}
+
 } //namespace blas
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/detail/blas_ct_templates.hpp b/include/onemkl/blas/detail/blas_ct_templates.hpp
new file mode 100644
index 000000000..765dcd788
--- /dev/null
+++ b/include/onemkl/blas/detail/blas_ct_templates.hpp
@@ -0,0 +1,2089 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+//
+// Generated based on onemkl/blas/blas.hpp
+//
+
+#ifndef _DETAIL_COMMON_BLAS_HPP__
+#define _DETAIL_COMMON_BLAS_HPP__
+
+#include <CL/sycl.hpp>
+#include <cstdint>
+
+#include "onemkl/detail/backends.hpp"
+#include "onemkl/detail/libraries.hpp"
+#include "onemkl/types.hpp"
+
+namespace onemkl {
+namespace blas {
+
+// Buffer APIs
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2, buffer API, float; declaration only (no body here)
+static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2, double; declaration only
+static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, buffer API, float; declaration only (no body here)
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, double; declaration only
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, std::complex<float> alpha and x; declaration only
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, std::complex<double> alpha and x; declaration only
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, float alpha with std::complex<float> x; declaration only
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // scal, double alpha with std::complex<double> x; declaration only
+static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmv, buffer API, float; declaration only (no body here)
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmv, double; declaration only
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmv, std::complex<float>; declaration only
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmv, std::complex<double>; declaration only
+static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpmv, buffer API, float; declaration only (no body here)
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpmv, double; declaration only
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpmv, std::complex<float>; declaration only
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpmv, std::complex<double>; declaration only
+static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // spr, buffer API, float; declaration only (no body here)
+static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<float, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // spr, double; declaration only
+static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<double, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpmv, buffer API, std::complex<float>; declaration only
+static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpmv, std::complex<double>; declaration only
+static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // syrk, buffer API, float; declaration only (no body here)
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
+                        std::int64_t lda, float beta, cl::sycl::buffer<float, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syrk, double; declaration only
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
+                        std::int64_t lda, double beta, cl::sycl::buffer<double, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syrk, std::complex<float>; declaration only
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syrk, std::complex<double>; declaration only
+static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // her2, buffer API, std::complex<float>; declaration only
+static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // her2, std::complex<double>; declaration only
+static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // hbmv, buffer API, std::complex<float>; declaration only
+static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, std::complex<float> beta,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // hbmv, std::complex<double>; declaration only
+static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, std::complex<double> beta,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // rot, buffer API, std::complex<float> x/y with float c, s; declaration only
+static inline void rot(cl::sycl::queue &queue, std::int64_t n,
+                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
+                       float s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rot, std::complex<double> x/y with double c, s; declaration only
+static inline void rot(cl::sycl::queue &queue, std::int64_t n,
+                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
+                       double s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rot, float; declaration only
+static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c,
+                       float s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rot, double; declaration only
+static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                       double c, double s);
+
+template <onemkl::library lib, onemkl::backend backend>  // axpy, buffer API, float; declaration only (no body here)
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // axpy, double; declaration only
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // axpy, std::complex<float>; declaration only
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // axpy, std::complex<double>; declaration only
+static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gerc, buffer API, std::complex<float>; declaration only
+static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // gerc, std::complex<double>; declaration only
+static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k, buffer API, float; declaration only (no body here)
+static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
+                         std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
+                         float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k, double; declaration only
+static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
+                         std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
+                         double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k, std::complex<float>; declaration only
+static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, std::complex<float> alpha,
+                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+                         std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                         std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k, std::complex<double>; declaration only
+static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, std::complex<double> alpha,
+                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                         std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                         std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: float data
+static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: double data
+static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: complex<float> data
+static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, std::complex<float> beta,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: complex<double> data
+static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, std::complex<double> beta,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // her: complex<float> data, float alpha
+static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // her: complex<double> data, double alpha
+static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr: complex<float> data, float alpha
+static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<float>, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr: complex<double> data, double alpha
+static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<std::complex<double>, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: float x, int64 result
+static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: double x, int64 result
+static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: complex<float> x, int64 result
+static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
+                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                         cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: complex<double> x, int64 result
+static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
+                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                         cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch: float data
+static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                              cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, float beta,
+                              cl::sycl::buffer<float, 1> &c, std::int64_t ldc,
+                              std::int64_t stride_c, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch: double data
+static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                              cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, double beta,
+                              cl::sycl::buffer<double, 1> &c, std::int64_t ldc,
+                              std::int64_t stride_c, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch: complex<float> data
+static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k,
+                              std::complex<float> alpha,
+                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                              cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
+                              std::int64_t stride_c, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch: complex<double> data
+static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k,
+                              std::complex<double> alpha,
+                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                              cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
+                              std::int64_t stride_c, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // spmv: float data
+static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // spmv: double data
+static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: half a/b, float c/alpha/beta
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
+                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, float beta,
+                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: int8 a, uint8 b, int32 c/co
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
+                            float alpha, cl::sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
+                            std::int8_t ao, cl::sycl::buffer<std::uint8_t, 1> &b, std::int64_t ldb,
+                            std::uint8_t bo, float beta, cl::sycl::buffer<std::int32_t, 1> &c,
+                            std::int64_t ldc, cl::sycl::buffer<std::int32_t, 1> &co);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: float data
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                            cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                            cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
+                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: double data
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                            cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                            cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
+                            cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: complex<float> data
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k,
+                            std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                            std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
+                            std::int64_t ldb, std::complex<float> beta,
+                            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: complex<double> data
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k,
+                            std::complex<double> alpha,
+                            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                            std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_ext: half data
+static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
+                            std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
+                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
+                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
+                            cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: float data
+static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: double data
+static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: complex<float> data
+static inline void swap(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: complex<double> data
+static inline void swap(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // geru: complex<float> data
+static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // geru: complex<double> data
+static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: complex<float> x, float result
+static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: complex<double> x, double result
+static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: float x and result
+static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: double x and result
+static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: float data
+static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                        std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
+                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: double data
+static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                        std::int64_t n, std::int64_t k, double alpha,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
+                        cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: complex<float> data
+static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: complex<double> data
+static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: half data
+static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                        std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
+                        cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // herk: complex<float>, float alpha/beta
+static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // herk: complex<double>, double alpha/beta
+static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                        std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // ger: float data
+static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // ger: double data
+static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: float data
+static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: double data
+static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: complex<float> data
+static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: complex<double> data
+static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // dotu: complex<float> x, y, result
+static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<std::complex<float>, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // dotu: complex<double> x, y, result
+static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<std::complex<double>, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // hemm: complex<float> data
+static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // hemm: complex<double> data
+static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr2: complex<float> data
+static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr2: complex<double> data
+static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: float data
+static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::int64_t kl, std::int64_t ku, float alpha,
+                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: double data
+static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::int64_t kl, std::int64_t ku, double alpha,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: complex<float> data
+static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: complex<double> data
+static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                        std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+                        std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: float data
+static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: double data
+static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: complex<float> data
+static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: complex<double> data
+static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: float data
+static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
+                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: double data
+static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
+                        double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: complex<float> data
+static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: complex<double> data
+static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                        std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                        std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // dotc: complex<float> x, y, result
+static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<std::complex<float>, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // dotc: complex<double> x, y, result
+static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<std::complex<double>, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr: float data
+static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // syr: double data
+static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: float data
+static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: double data
+static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: complex<float> data
+static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: complex<double> data
+static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotmg: float buffers, y1 by value
+static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1,
+                         cl::sycl::buffer<float, 1> &d2, cl::sycl::buffer<float, 1> &x1, float y1,
+                         cl::sycl::buffer<float, 1> &param);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotmg: double buffers, y1 by value
+static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1,
+                         cl::sycl::buffer<double, 1> &d2, cl::sycl::buffer<double, 1> &x1,
+                         double y1, cl::sycl::buffer<double, 1> &param);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: float buffers (no lda)
+static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: double buffers (no lda)
+static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: complex<float> buffers
+static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: complex<double> buffers
+static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: float buffers
+static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: double buffers
+static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: complex<float> buffers
+static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: complex<double> buffers
+static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: float buffers
+static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: double buffers
+static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: complex<float> buffers
+static inline void copy(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: complex<double> buffers
+static inline void copy(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // hemv: complex<float> buffers
+static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx, std::complex<float> beta,
+                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // hemv: complex<double> buffers
+static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
+                        std::int64_t incx, std::complex<double> beta,
+                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: float buffers
+static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                         transpose transb, std::int64_t n, std::int64_t k, float alpha,
+                         cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
+                         cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: double buffers
+static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                         transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                         cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
+                         cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: complex<float> buffers
+static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                         transpose transb, std::int64_t n, std::int64_t k,
+                         std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+                         std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
+                         std::int64_t ldb, std::complex<float> beta,
+                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: complex<double> buffers
+static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                         transpose transb, std::int64_t n, std::int64_t k,
+                         std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+                         std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b,
+                         std::int64_t ldb, std::complex<double> beta,
+                         cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // sbmv: float buffers
+static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // sbmv: double buffers
+static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: complex<float> x, float result
+static inline void asum(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: complex<double> x, double result
+static inline void asum(cl::sycl::queue &queue, std::int64_t n,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: float x, float result
+static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: double x, double result
+static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: float buffers
+static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: double buffers
+static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: complex<float> buffers
+static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
+                        std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: complex<double> buffers
+static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                        std::int64_t n, std::int64_t k,
+                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
+template <onemkl::library lib, onemkl::backend backend>  // spr2: float buffers (no lda)
+static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<float, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // spr2: double buffers (no lda)
+static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<double, 1> &a);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: float x, int64_t result
+static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: double x, int64_t result
+static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: complex<float> x, int64_t result
+static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
+                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+                         cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: complex<double> x, int64_t result
+static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
+                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+                         cl::sycl::buffer<std::int64_t, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm_batch: float buffers, strided
+static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm_batch: double buffers, strided
+static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm_batch: complex<float> buffers
+static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<float> alpha,
+                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm_batch: complex<double> buffers
+static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<double> alpha,
+                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
+                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotm: float buffers
+static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<float, 1> &param);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotm: double buffers
+static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                        cl::sycl::buffer<double, 1> &param);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: float buffers
+static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a,
+                        cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<float, 1> &c,
+                        cl::sycl::buffer<float, 1> &s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: double buffers
+static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a,
+                        cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<double, 1> &c,
+                        cl::sycl::buffer<double, 1> &s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: complex<float> a/b/s, float c
+static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
+                        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
+                        cl::sycl::buffer<std::complex<float>, 1> &s);
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: complex<double> a/b/s, double c
+static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
+                        cl::sycl::buffer<std::complex<double>, 1> &b,
+                        cl::sycl::buffer<double, 1> &c,
+                        cl::sycl::buffer<std::complex<double>, 1> &s);
+
+template <onemkl::library lib, onemkl::backend backend>  // sdsdot: float buffers, scalar sb
+static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb,
+                          cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+                          cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                          cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // her2k: complex<float>, float beta
+static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, std::complex<float> alpha,
+                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
+                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // her2k: complex<double>, double beta
+static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                         std::int64_t k, std::complex<double> alpha,
+                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+                         double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+                         std::int64_t ldc);
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: float x/y, float result
+static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                       cl::sycl::buffer<float, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: double x/y, double result
+static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
+                       cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: float x/y, double result
+static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
+                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
+                       cl::sycl::buffer<double, 1> &result);
+
+template <onemkl::library lib, onemkl::backend backend>  // symv: float buffers
+static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
+template <onemkl::library lib, onemkl::backend backend>  // symv: double buffers
+static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
+// USM (Unified Shared Memory) APIs: raw-pointer arguments, cl::sycl::event return value
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syr2: float
+static inline cl::sycl::event syr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syr2: double
+static inline cl::sycl::event syr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: float
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: double
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: complex<float>
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: complex<double>
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: float alpha, complex<float>
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM scal: double alpha, complex<double>
+static inline cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM trmv: float
+static inline cl::sycl::event trmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM trmv: double
+static inline cl::sycl::event trmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM trmv: complex<float>
+static inline cl::sycl::event trmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM trmv: complex<double>
+static inline cl::sycl::event trmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM tpmv: float (no lda)
+static inline cl::sycl::event tpmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM tpmv: double (no lda)
+static inline cl::sycl::event tpmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM tpmv: complex<float>
+static inline cl::sycl::event tpmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM tpmv: complex<double>
+static inline cl::sycl::event tpmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM spr: float (no lda)
+static inline cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM spr: double (no lda)
+static inline cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM hpmv: complex<float>
+static inline cl::sycl::event hpmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM hpmv: complex<double>
+static inline cl::sycl::event hpmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syrk: float
+static inline cl::sycl::event syrk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syrk: double
+static inline cl::sycl::event syrk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syrk: complex<float>
+static inline cl::sycl::event syrk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM syrk: complex<double>
+static inline cl::sycl::event syrk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM her2: complex<float>
+static inline cl::sycl::event her2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM her2: complex<double>
+static inline cl::sycl::event her2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM hbmv: complex<float>
+static inline cl::sycl::event hbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM hbmv: complex<double>
+static inline cl::sycl::event hbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM rot: complex<float> x/y, float c/s
+static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                                  std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                                  float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM rot: complex<double> x/y, double c/s
+static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                                  std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                                  double c, double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM rot: float
+static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM rot: double
+static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                  std::int64_t incx, double *y, std::int64_t incy, double c,
+                                  double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy: float
+static inline cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy: double
+static inline cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy: complex<float>
+static inline cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy: complex<double>
+static inline cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy_batch: float, grouped
+static inline cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy_batch: double, grouped
+static inline cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy_batch: complex<float>
+static inline cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // USM axpy_batch: complex<double>
+static inline cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gerc: std::complex<float>
+static inline cl::sycl::event gerc(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gerc: std::complex<double>
+static inline cl::sycl::event gerc(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k: float
+static inline cl::sycl::event syr2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k: double
+static inline cl::sycl::event syr2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k: std::complex<float>
+static inline cl::sycl::event syr2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr2k: std::complex<double>
+static inline cl::sycl::event syr2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: float
+static inline cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: double
+static inline cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: std::complex<float>
+static inline cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemv: std::complex<double>
+static inline cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // her: std::complex<float>, float alpha
+static inline cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // her: std::complex<double>, double alpha
+static inline cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr: std::complex<float>, float alpha
+static inline cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr: std::complex<double>, double alpha
+static inline cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: float x
+static inline cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: double x
+static inline cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: std::complex<float> x
+static inline cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamin: std::complex<double> x
+static inline cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (grouped): float
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (grouped): double
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (grouped): complex<float>
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (grouped): complex<double>
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (strided): float
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (strided): double
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (strided): complex<float>
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm_batch (strided): complex<double>
+static inline cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // spmv: float
+static inline cl::sycl::event spmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // spmv: double
+static inline cl::sycl::event spmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: float
+static inline cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: double
+static inline cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: std::complex<float>
+static inline cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // swap: std::complex<double>
+static inline cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // geru: std::complex<float>
+static inline cl::sycl::event geru(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // geru: std::complex<double>
+static inline cl::sycl::event geru(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: complex<float> x, float result
+static inline cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: complex<double> x, double result
+static inline cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: float x, float result
+static inline cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // nrm2: double x, double result
+static inline cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: float
+static inline cl::sycl::event gemm(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: double
+static inline cl::sycl::event gemm(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: std::complex<float>
+static inline cl::sycl::event gemm(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemm: std::complex<double>
+static inline cl::sycl::event gemm(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // herk: complex<float>, float alpha/beta
+static inline cl::sycl::event herk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // herk: complex<double>, double alpha/beta
+static inline cl::sycl::event herk(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // ger: float
+static inline cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, const float *y,
+                                  std::int64_t incy, float *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // ger: double
+static inline cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, const double *y,
+                                  std::int64_t incy, double *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: float
+static inline cl::sycl::event trsm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: double
+static inline cl::sycl::event trsm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: std::complex<float>
+static inline cl::sycl::event trsm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsm: std::complex<double>
+static inline cl::sycl::event trsm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dotu: std::complex<float>
+static inline cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dotu: std::complex<double>
+static inline cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hemm: std::complex<float>
+static inline cl::sycl::event hemm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hemm: std::complex<double>
+static inline cl::sycl::event hemm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr2: std::complex<float>
+static inline cl::sycl::event hpr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hpr2: std::complex<double>
+static inline cl::sycl::event hpr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: float
+static inline cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: double
+static inline cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: std::complex<float>
+static inline cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gbmv: std::complex<double>
+static inline cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: float
+static inline cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: double
+static inline cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: std::complex<float>
+static inline cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbmv: std::complex<double>
+static inline cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: float
+static inline cl::sycl::event symm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: double
+static inline cl::sycl::event symm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: std::complex<float>
+static inline cl::sycl::event symm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symm: std::complex<double>
+static inline cl::sycl::event symm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dotc: std::complex<float>
+static inline cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dotc: std::complex<double>
+static inline cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr: float
+static inline cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,
+                                  std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // syr: double
+static inline cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,
+                                  std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: float
+static inline cl::sycl::event trmm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: double
+static inline cl::sycl::event trmm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: std::complex<float>
+static inline cl::sycl::event trmm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trmm: std::complex<double>
+static inline cl::sycl::event trmm(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotmg: float overload
+static inline cl::sycl::event rotmg(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotmg: double overload
+static inline cl::sycl::event rotmg(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: float overload
+static inline cl::sycl::event tpsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: double overload
+static inline cl::sycl::event tpsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: std::complex<float> overload
+static inline cl::sycl::event tpsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tpsv: std::complex<double> overload
+static inline cl::sycl::event tpsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: float overload
+static inline cl::sycl::event trsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: double overload
+static inline cl::sycl::event trsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: std::complex<float> overload
+static inline cl::sycl::event trsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // trsv: std::complex<double> overload
+static inline cl::sycl::event trsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: float overload
+static inline cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: double overload
+static inline cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: std::complex<float> overload
+static inline cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // copy: std::complex<double> overload
+static inline cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hemv: std::complex<float> overload
+static inline cl::sycl::event hemv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // hemv: std::complex<double> overload
+static inline cl::sycl::event hemv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: float overload
+static inline cl::sycl::event gemmt(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: double overload
+static inline cl::sycl::event gemmt(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: std::complex<float> overload
+static inline cl::sycl::event gemmt(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // gemmt: std::complex<double> overload
+static inline cl::sycl::event gemmt(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // sbmv: float overload
+static inline cl::sycl::event sbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // sbmv: double overload
+static inline cl::sycl::event sbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: complex<float> x, float result
+static inline cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: complex<double> x, double result
+static inline cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: float overload
+static inline cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // asum: double overload
+static inline cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: float overload
+static inline cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: double overload
+static inline cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: std::complex<float> overload
+static inline cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // tbsv: std::complex<double> overload
+static inline cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // spr2: float overload
+static inline cl::sycl::event spr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // spr2: double overload
+static inline cl::sycl::event spr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: float overload
+static inline cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: double overload
+static inline cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: std::complex<float> overload
+static inline cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // iamax: std::complex<double> overload
+static inline cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotm: float overload
+static inline cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotm: double overload
+static inline cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: float overload
+static inline cl::sycl::event rotg(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: double overload
+static inline cl::sycl::event rotg(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: complex<float> a/b/s, float c
+static inline cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // rotg: complex<double> a/b/s, double c
+static inline cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // sdsdot: float only
+static inline cl::sycl::event sdsdot(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // her2k: complex<float>, real float beta
+static inline cl::sycl::event her2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // her2k: complex<double>, double beta
+static inline cl::sycl::event her2k(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: float x/y, float result
+static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  float *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: double x/y, double result
+static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                  std::int64_t incx, const double *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // dot: float x/y, double result
+static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symv: float overload
+static inline cl::sycl::event symv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+template <onemkl::library lib, onemkl::backend backend>  // symv: double overload
+static inline cl::sycl::event symv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+} //namespace blas
+} //namespace onemkl
+
+#endif //_DETAIL_COMMON_BLAS_HPP__
diff --git a/include/onemkl/blas/detail/blas_loader.hpp b/include/onemkl/blas/detail/blas_loader.hpp
index e14766f81..bec2b9fb5 100644
--- a/include/onemkl/blas/detail/blas_loader.hpp
+++ b/include/onemkl/blas/detail/blas_loader.hpp
@@ -31,6 +31,8 @@ namespace onemkl {
 namespace blas {
 namespace detail {
 
+// Buffer APIs (operands passed as cl::sycl::buffer)
+
 ONEMKL_EXPORT void herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
                         std::int64_t n, std::int64_t k, float alpha,
                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
@@ -91,42 +93,6 @@ ONEMKL_EXPORT void spr(char *libname, cl::sycl::queue &queue, uplo upper_lower,
                        double alpha, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
                        cl::sycl::buffer<double, 1> &a);
 
-ONEMKL_EXPORT void gemm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void gemm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void gemm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void gemm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
 ONEMKL_EXPORT void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa,
                               transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
                               float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -757,38 +723,6 @@ ONEMKL_EXPORT void spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower,
                         cl::sycl::buffer<double, 1> &y, std::int64_t incy,
                         cl::sycl::buffer<double, 1> &a);
 
-ONEMKL_EXPORT void trsm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void trsm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void trsm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-ONEMKL_EXPORT void trsm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
 ONEMKL_EXPORT void trsm_batch(char *libname, cl::sycl::queue &queue, side left_right,
                               uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
                               std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
@@ -868,6 +802,797 @@ ONEMKL_EXPORT void rotg(char *libname, cl::sycl::queue &queue,
                         cl::sycl::buffer<std::complex<double>, 1> &b,
                         cl::sycl::buffer<double, 1> &c,
                         cl::sycl::buffer<std::complex<double>, 1> &s);
+
+// USM (Unified Shared Memory) APIs
+// herk: complex<float>/complex<double> pointer overloads; alpha and beta are real scalars.
+ONEMKL_EXPORT cl::sycl::event herk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const std::complex<float> *a, std::int64_t lda, float beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event herk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// scal: six overloads; x is a non-const pointer, and alpha may be real while x is complex.
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+    std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event scal(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmv: float/double/complex overloads; a is a const pointer with lda, x is non-const.
+ONEMKL_EXPORT cl::sycl::event trmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpmv: float/double/complex overloads; a is a const pointer (no lda), x is non-const.
+ONEMKL_EXPORT cl::sycl::event tpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr: float/double overloads; x is a const pointer, a is non-const and takes no lda.
+ONEMKL_EXPORT cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
+                                  float *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
+                                  double *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch: pointer-argument overloads with group_count, then strided ones with batch_size.
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
+    const float **b, std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
+    const double **b, std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
+    std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
+    std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+    std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+    std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+    float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+    std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+    double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
+    std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
+    std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syrk: float/double/complex overloads; alpha and beta share the element type of a and c.
+ONEMKL_EXPORT cl::sycl::event syrk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, float *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syrk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, double *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syrk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syrk(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her2: complex overloads; x and y are const pointers, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event her2(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event her2(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hbmv: complex overloads; a and x are const pointers, y is non-const; alpha/beta are complex.
+ONEMKL_EXPORT cl::sycl::event hbmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hbmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rot: x and y are both non-const; c and s are real scalars, including in the complex overloads.
+ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                                  std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+                                  std::int64_t incy, float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                                  std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *y, std::int64_t incy, double c, double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x,
+                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x,
+                                  std::int64_t incx, double *y, std::int64_t incy, double c,
+                                  double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy: float/double/complex overloads; x is a const pointer, y is non-const.
+ONEMKL_EXPORT cl::sycl::event axpy(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy_batch: n, alpha, incx, incy are pointers; x and y are pointer-to-pointer.
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    char *libname, cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+    std::int64_t *incx, float **y, std::int64_t *incy, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    char *libname, cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+    std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    char *libname, cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    char *libname, cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gerc: complex overloads; x and y are const pointers, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event gerc(
+    char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gerc(
+    char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2k: float/double/complex overloads; a and b are const pointers, c is non-const.
+ONEMKL_EXPORT cl::sycl::event syr2k(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syr2k(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syr2k(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syr2k(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemv: float/double/complex overloads; a and x are const pointers, y is non-const.
+ONEMKL_EXPORT cl::sycl::event gemv(
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemv(
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+    double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemv(
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemv(
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her: complex overloads with a real alpha; x is a const pointer, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, float alpha, const std::complex<float> *x,
+                                  std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, double alpha, const std::complex<double> *x,
+                                  std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr: complex overloads with a real alpha; x is a const pointer, a is non-const (no lda).
+ONEMKL_EXPORT cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, float alpha, const std::complex<float> *x,
+                                  std::int64_t incx, std::complex<float> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower,
+                                  std::int64_t n, double alpha, const std::complex<double> *x,
+                                  std::int64_t incx, std::complex<double> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamin: float/double/complex overloads; result is a non-const std::int64_t pointer.
+ONEMKL_EXPORT cl::sycl::event iamin(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamin(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamin(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamin(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpmv: complex overloads; a and x are const pointers (a has no lda), y is non-const.
+ONEMKL_EXPORT cl::sycl::event hpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hpmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spmv: float/double overloads; a and x are const pointers (a has no lda), y is non-const.
+ONEMKL_EXPORT cl::sycl::event spmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const float *a, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event spmv(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const double *a, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotmg: float/double overloads; y1 is passed by value, d1/d2/x1/param are non-const pointers.
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    char *libname, cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    char *libname, cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+    double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// swap: float/double/complex overloads; x and y are both non-const pointers.
+ONEMKL_EXPORT cl::sycl::event swap(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event swap(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event swap(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event swap(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+    std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// geru: complex overloads; x and y are const pointers, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event geru(
+    char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event geru(
+    char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// nrm2: the complex overloads take a real result pointer (float * / double *).
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemmt: float/double/complex overloads; takes upper_lower plus transa/transb, n and k.
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+    std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
+    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+    std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+    const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm: float/double/complex overloads; a and b are const pointers, c is non-const.
+ONEMKL_EXPORT cl::sycl::event gemm(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
+    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+    const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gemm(
+    char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2: float/double overloads; x and y are const pointers, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event syr2(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syr2(
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// ger: float/double overloads; x and y are const pointers, a is non-const with lda.
+ONEMKL_EXPORT cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m,
+                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
+                                  const float *y, std::int64_t incy, float *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m,
+                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
+                                  const double *y, std::int64_t incy, double *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsm: float/double/complex overloads; a is a const pointer, b is non-const.
+ONEMKL_EXPORT cl::sycl::event trsm(
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+    float *b, std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsm(
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
+    double *b, std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsm(
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsm(
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotu(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event dotu(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+    std::complex<double> *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemm(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hemm(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr2(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hpr2(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, double alpha, const double *a, std::int64_t lda,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event symm(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event symm(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event symm(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotc(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event dotc(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+    std::complex<double> *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower,  // USM, float
+                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
+                                  float *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower,  // USM, double
+                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
+                                  double *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+    float *b, std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
+    double *b, std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+    diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event symv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event trsv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event copy(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event copy(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event copy(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event hemv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(  // USM (raw-pointer) API; float input, std::int64_t result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamax(  // USM (raw-pointer) API; double input, std::int64_t result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamax(  // USM API; std::complex<float> input, std::int64_t result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event iamax(  // USM API; std::complex<double> input, std::int64_t result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sbmv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event sbmv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+    double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(  // USM (raw-pointer) API; std::complex<float> input, float result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+    std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event asum(  // USM (raw-pointer) API; std::complex<double> input, double result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+    std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event asum(  // USM (raw-pointer) API; float input, float result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event asum(  // USM (raw-pointer) API; double input, double result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM (raw-pointer) API; element type: std::complex<float>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM (raw-pointer) API; element type: std::complex<double>
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t n, std::int64_t k, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr2(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event spr2(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotm(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rotm(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n,  // USM, float
+                                  const float *x, std::int64_t incx, const float *y,
+                                  std::int64_t incy, float *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n,  // USM, double
+                                  const double *x, std::int64_t incx, const double *y,
+                                  std::int64_t incy, double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n,  // USM, float inputs, double result
+                                  const float *x, std::int64_t incx, const float *y,
+                                  std::int64_t incy, double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sdsdot(  // USM (raw-pointer) API; float sb, inputs and result
+    char *libname, cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2k(  // USM (raw-pointer) API; std::complex<float> data, float beta
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event her2k(  // USM (raw-pointer) API; std::complex<double> data, double beta
+    char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(  // USM (raw-pointer) API; element type: float
+    char *libname, cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rotg(  // USM (raw-pointer) API; element type: double
+    char *libname, cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rotg(  // USM (raw-pointer) API; std::complex<float> a, b, s; float c
+    char *libname, cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+ONEMKL_EXPORT cl::sycl::event rotg(  // USM (raw-pointer) API; std::complex<double> a, b, s; double c
+    char *libname, cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+    double *c, std::complex<double> *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
 } //namespace detail
 } //namespace blas
 } //namespace onemkl
diff --git a/include/onemkl/blas/detail/cublas/blas_ct.hpp b/include/onemkl/blas/detail/cublas/blas_ct.hpp
index 4ac19b7f8..a18b167ce 100644
--- a/include/onemkl/blas/detail/cublas/blas_ct.hpp
+++ b/include/onemkl/blas/detail/cublas/blas_ct.hpp
@@ -33,14 +33,13 @@
 
 #include "onemkl_blas_cublas.hpp"
 
+#include "onemkl/blas/detail/blas_ct_templates.hpp"
+
 namespace onemkl {
 namespace blas {
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+// Buffer APIs
+
 template <>
 void syr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -52,11 +51,6 @@ void syr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -68,9 +62,6 @@ void syr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
@@ -79,9 +70,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, double alpha,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx) {
@@ -90,9 +78,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                std::complex<float> alpha,
@@ -103,9 +88,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                std::complex<double> alpha,
@@ -116,9 +98,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -128,9 +107,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, double alpha,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -140,10 +116,6 @@ void scal<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -154,10 +126,6 @@ void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -168,11 +136,6 @@ void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -185,11 +148,6 @@ void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -202,10 +160,6 @@ void trmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -216,10 +170,6 @@ void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -230,10 +180,6 @@ void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -245,10 +191,6 @@ void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -260,10 +202,6 @@ void tpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a);
 template <>
 void spr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, float alpha,
@@ -274,10 +212,6 @@ void spr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a);
 template <>
 void spr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, double alpha,
@@ -288,12 +222,6 @@ void spr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::complex<float> alpha,
@@ -307,12 +235,6 @@ void hpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::complex<double> alpha,
@@ -326,11 +248,6 @@ void hpmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<float, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, std::int64_t n, std::int64_t k,
@@ -342,11 +259,6 @@ void syrk<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<double, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, std::int64_t n, std::int64_t k,
@@ -358,12 +270,6 @@ void syrk<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -374,12 +280,6 @@ void syrk<library::cublas, backend::nvidiagpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -390,12 +290,6 @@ void syrk<library::cublas, backend::nvidiagpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -407,12 +301,6 @@ void her2<library::cublas, backend::nvidiagpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -424,12 +312,6 @@ void her2<library::cublas, backend::nvidiagpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -441,12 +323,6 @@ void hbmv<library::cublas, backend::nvidiagpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -458,11 +334,6 @@ void hbmv<library::cublas, backend::nvidiagpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -474,11 +345,6 @@ void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
 template <>
 void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -490,10 +356,6 @@ void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -504,10 +366,6 @@ void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       double c, double s);
 template <>
 void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -518,10 +376,6 @@ void rot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -531,10 +385,6 @@ void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, double alpha,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -544,10 +394,6 @@ void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                std::complex<float> alpha,
@@ -560,10 +406,6 @@ void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                std::complex<double> alpha,
@@ -576,12 +418,6 @@ void axpy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -593,12 +429,6 @@ void gerc<library::cublas, backend::nvidiagpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -610,11 +440,6 @@ void gerc<library::cublas, backend::nvidiagpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                         float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -627,11 +452,6 @@ void syr2k<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upp
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -644,13 +464,6 @@ void syr2k<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upp
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -662,13 +475,6 @@ void syr2k<library::cublas, backend::nvidiagpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -680,11 +486,6 @@ void syr2k<library::cublas, backend::nvidiagpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose trans,
                                                std::int64_t m, std::int64_t n, float alpha,
@@ -697,11 +498,6 @@ void gemv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose trans,
                                                std::int64_t m, std::int64_t n, double alpha,
@@ -714,12 +510,6 @@ void gemv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -731,12 +521,6 @@ void gemv<library::cublas, backend::nvidiagpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -748,10 +532,6 @@ void gemv<library::cublas, backend::nvidiagpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
 template <>
 void her<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, float alpha,
@@ -764,10 +544,6 @@ void her<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
 template <>
 void her<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, double alpha,
@@ -780,10 +556,6 @@ void her<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, float alpha,
@@ -795,10 +567,6 @@ void hpr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, double alpha,
@@ -810,9 +578,6 @@ void hpr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -822,9 +587,6 @@ void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -834,10 +596,6 @@ void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -848,10 +606,6 @@ void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -862,128 +616,6 @@ void iamin<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                              cl::sycl::buffer<transpose, 1> &transb,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<std::int64_t, 1> &k,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb,
-                              cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                              cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              cl::sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -999,14 +631,6 @@ void gemm_batch<library::cublas, backend::nvidiagpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                              cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              cl::sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1022,15 +646,6 @@ void gemm_batch<library::cublas, backend::nvidiagpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1047,15 +662,6 @@ void gemm_batch<library::cublas, backend::nvidiagpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1072,11 +678,6 @@ void gemm_batch<library::cublas, backend::nvidiagpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -1089,11 +690,6 @@ void spmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -1106,12 +702,6 @@ void spmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                    transpose transb, std::int64_t m, std::int64_t n,
@@ -1125,13 +715,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, trans
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                            float alpha, cl::sycl::buffer<int8_t, 1> &a, std::int64_t lda,
-                            int8_t ao, cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb,
-                            uint8_t bo, float beta, cl::sycl::buffer<int32_t, 1> &c,
-                            std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m,
@@ -1146,12 +729,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(
                            beta, c, ldc, co);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                    transpose transb, std::int64_t m, std::int64_t n,
@@ -1165,12 +742,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, trans
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                            cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                            cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                    transpose transb, std::int64_t m, std::int64_t n,
@@ -1184,13 +755,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, trans
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                            std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                            std::int64_t ldb, std::complex<float> beta,
-                            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1202,14 +766,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<double> alpha,
-                            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                            std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1221,12 +777,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                            cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                    transpose transb, std::int64_t m, std::int64_t n,
@@ -1240,9 +790,6 @@ void gemm_ext<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, trans
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1252,9 +799,6 @@ void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1264,10 +808,6 @@ void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1279,10 +819,6 @@ void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1294,12 +830,6 @@ void swap<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -1311,12 +841,6 @@ void geru<library::cublas, backend::nvidiagpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -1328,10 +852,6 @@ void geru<library::cublas, backend::nvidiagpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1342,10 +862,6 @@ void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1356,9 +872,6 @@ void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1368,9 +881,6 @@ void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1380,11 +890,6 @@ void nrm2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                transpose transb, std::int64_t m, std::int64_t n,
@@ -1398,12 +903,6 @@ void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                transpose transb, std::int64_t m, std::int64_t n,
@@ -1417,13 +916,6 @@ void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1435,13 +927,6 @@ void gemm<library::cublas, backend::nvidiagpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1453,11 +938,6 @@ void gemm<library::cublas, backend::nvidiagpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                        cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose transa,
                                                transpose transb, std::int64_t m, std::int64_t n,
@@ -1471,11 +951,6 @@ void gemm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1486,11 +961,6 @@ void herk<library::cublas, backend::nvidiagpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1501,11 +971,6 @@ void herk<library::cublas, backend::nvidiagpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void ger<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t m,
                                               std::int64_t n, float alpha,
@@ -1517,11 +982,6 @@ void ger<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void ger<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t m,
                                               std::int64_t n, double alpha,
@@ -1533,11 +993,6 @@ void ger<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, transpose trans, diag unit_diag,
@@ -1552,11 +1007,6 @@ void trsm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, transpose trans, diag unit_diag,
@@ -1571,11 +1021,6 @@ void trsm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1590,11 +1035,6 @@ void trsm<library::cublas, backend::nvidiagpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1609,11 +1049,6 @@ void trsm<library::cublas, backend::nvidiagpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotu<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1626,11 +1061,6 @@ void dotu<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotu<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1643,13 +1073,6 @@ void dotu<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1661,13 +1084,6 @@ void hemm<library::cublas, backend::nvidiagpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1679,11 +1095,6 @@ void hemm<library::cublas, backend::nvidiagpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::complex<float> alpha,
@@ -1697,11 +1108,6 @@ void hpr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::complex<double> alpha,
@@ -1715,12 +1121,6 @@ void hpr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose trans,
                                                std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1734,12 +1134,6 @@ void gbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose trans,
                                                std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1753,13 +1147,6 @@ void gbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, transpose
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1771,13 +1158,6 @@ void gbmv<library::cublas, backend::nvidiagpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1789,10 +1169,6 @@ void gbmv<library::cublas, backend::nvidiagpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -1804,10 +1180,6 @@ void tbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -1819,11 +1191,6 @@ void tbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1834,11 +1201,6 @@ void tbmv<library::cublas, backend::nvidiagpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1849,11 +1211,6 @@ void tbmv<library::cublas, backend::nvidiagpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1866,11 +1223,6 @@ void symm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                        double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1883,13 +1235,6 @@ void symm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1901,13 +1246,6 @@ void symm<library::cublas, backend::nvidiagpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1919,11 +1257,6 @@ void symm<library::cublas, backend::nvidiagpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotc<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1936,11 +1269,6 @@ void dotc<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotc<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1953,10 +1281,6 @@ void dotc<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void syr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, float alpha,
@@ -1967,10 +1291,6 @@ void syr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                               std::int64_t n, double alpha,
@@ -1981,11 +1301,6 @@ void syr<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, transpose trans, diag unit_diag,
@@ -2000,11 +1315,6 @@ void trmm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left_right,
                                                uplo upper_lower, transpose trans, diag unit_diag,
@@ -2019,11 +1329,6 @@ void trmm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, side left
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2038,11 +1343,6 @@ void trmm<library::cublas, backend::nvidiagpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2057,10 +1357,6 @@ void trmm<library::cublas, backend::nvidiagpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1,
-                         cl::sycl::buffer<float, 1> &d2, cl::sycl::buffer<float, 1> &x1, float y1,
-                         cl::sycl::buffer<float, 1> &param);
 template <>
 void rotmg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<float, 1> &d1,
@@ -2072,10 +1368,6 @@ void rotmg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1,
-                         cl::sycl::buffer<double, 1> &d2, cl::sycl::buffer<double, 1> &x1,
-                         double y1, cl::sycl::buffer<double, 1> &param);
 template <>
 void rotmg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<double, 1> &d1,
@@ -2087,10 +1379,6 @@ void rotmg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2101,10 +1389,6 @@ void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2115,10 +1399,6 @@ void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2130,10 +1410,6 @@ void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2145,10 +1421,6 @@ void tpsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2159,10 +1431,6 @@ void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2173,11 +1441,6 @@ void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2190,11 +1453,6 @@ void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2207,9 +1465,6 @@ void trsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2219,9 +1474,6 @@ void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2231,10 +1483,6 @@ void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2246,10 +1494,6 @@ void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2261,12 +1505,6 @@ void copy<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -2278,12 +1516,6 @@ void hemv<library::cublas, backend::nvidiagpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -2295,12 +1527,6 @@ void hemv<library::cublas, backend::nvidiagpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                         cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose transa, transpose transb, std::int64_t n,
@@ -2317,12 +1543,6 @@ void gemmt<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upp
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                         cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose transa, transpose transb, std::int64_t n,
@@ -2339,13 +1559,6 @@ void gemmt<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upp
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2360,13 +1573,6 @@ void gemmt<library::cublas, backend::nvidiagpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2381,11 +1587,6 @@ void gemmt<library::cublas, backend::nvidiagpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::int64_t k, float alpha,
@@ -2398,11 +1599,6 @@ void sbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, std::int64_t k, double alpha,
@@ -2415,10 +1611,6 @@ void sbmv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2429,10 +1621,6 @@ void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2443,9 +1631,6 @@ void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2455,9 +1640,6 @@ void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2467,10 +1649,6 @@ void asum<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2482,10 +1660,6 @@ void tbsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                transpose trans, diag unit_diag, std::int64_t n,
@@ -2497,11 +1671,6 @@ void tbsv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbsv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2512,11 +1681,6 @@ void tbsv<library::cublas, backend::nvidiagpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2527,11 +1691,6 @@ void tbsv<library::cublas, backend::nvidiagpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a);
 template <>
 void spr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -2543,11 +1702,6 @@ void spr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a);
 template <>
 void spr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -2559,9 +1713,6 @@ void spr2<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2571,9 +1722,6 @@ void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2583,10 +1731,6 @@ void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2597,10 +1741,6 @@ void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2611,118 +1751,6 @@ void iamax<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                              cl::sycl::buffer<uplo, 1> &upper_lower,
-                              cl::sycl::buffer<transpose, 1> &trans,
-                              cl::sycl::buffer<diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::cublas, backend::nvidiagpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2737,12 +1765,6 @@ void trsm_batch<library::cublas, backend::nvidiagpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2757,13 +1779,6 @@ void trsm_batch<library::cublas, backend::nvidiagpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2779,13 +1794,6 @@ void trsm_batch<library::cublas, backend::nvidiagpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2801,10 +1809,6 @@ void trsm_batch<library::cublas, backend::nvidiagpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &param);
 template <>
 void rotm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2815,10 +1819,6 @@ void rotm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &param);
 template <>
 void rotm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2829,10 +1829,6 @@ void rotm<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int6
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<float, 1> &s);
 template <>
 void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                cl::sycl::buffer<float, 1> &a,
@@ -2844,10 +1840,6 @@ void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<double, 1> &s);
 template <>
 void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                cl::sycl::buffer<double, 1> &a,
@@ -2859,10 +1851,6 @@ void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<std::complex<float>, 1> &s);
 template <>
 void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                cl::sycl::buffer<std::complex<float>, 1> &a,
@@ -2874,11 +1862,6 @@ void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &b,
-                        cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<std::complex<double>, 1> &s);
 template <>
 void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
                                                cl::sycl::buffer<std::complex<double>, 1> &a,
@@ -2890,11 +1873,6 @@ void rotg<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb,
-                          cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                          cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                          cl::sycl::buffer<float, 1> &result);
 template <>
 void sdsdot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n, float sb,
                                                  cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2905,12 +1883,6 @@ void sdsdot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::in
     sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void her2k<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2922,13 +1894,6 @@ void her2k<library::cublas, backend::nvidiagpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void her2k<library::cublas, backend::nvidiagpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2940,10 +1905,6 @@ void her2k<library::cublas, backend::nvidiagpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &result);
 template <>
 void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2954,10 +1915,6 @@ void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2968,10 +1925,6 @@ void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64_t n,
                                               cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2982,11 +1935,6 @@ void dot<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, std::int64
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void symv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -2999,11 +1947,6 @@ void symv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void symv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -3016,6 +1959,2068 @@ void symv<library::cublas, backend::nvidiagpu>(cl::sycl::queue &queue, uplo uppe
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
+// USM (Unified Shared Memory) APIs: pointer-based variants that return a cl::sycl::event
+
+template <>  // USM syr2 (float): syr2_precondition -> onemkl::cublas::syr2 -> syr2_postcondition
+cl::sycl::event syr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                   dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;
+}
+
+template <>  // USM syr2 (double): syr2_precondition -> onemkl::cublas::syr2 -> syr2_postcondition
+cl::sycl::event syr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                   dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (float): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (double): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (std::complex<float>): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (std::complex<double>): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (float alpha, std::complex<float> x): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM scal (double alpha, std::complex<double> x): scal_precondition -> onemkl::cublas::scal -> scal_postcondition
+cl::sycl::event scal<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto ev = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM trmv (float): trmv_precondition -> onemkl::cublas::trmv -> trmv_postcondition
+cl::sycl::event trmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x,
+                                   incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM trmv (double): trmv_precondition -> onemkl::cublas::trmv -> trmv_postcondition
+cl::sycl::event trmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x,
+                                   incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM trmv (std::complex<float>): trmv_precondition -> onemkl::cublas::trmv -> trmv_postcondition
+cl::sycl::event trmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x,
+                                   incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM trmv (std::complex<double>): trmv_precondition -> onemkl::cublas::trmv -> trmv_postcondition
+cl::sycl::event trmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x,
+                                   incx, dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM tpmv (float): tpmv_precondition -> onemkl::cublas::tpmv -> tpmv_postcondition
+cl::sycl::event tpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM tpmv (double): tpmv_precondition -> onemkl::cublas::tpmv -> tpmv_postcondition
+cl::sycl::event tpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM tpmv (std::complex<float>): tpmv_precondition -> onemkl::cublas::tpmv -> tpmv_postcondition
+cl::sycl::event tpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM tpmv (std::complex<double>): tpmv_precondition -> onemkl::cublas::tpmv -> tpmv_postcondition
+cl::sycl::event tpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;
+}
+
+template <>  // USM spr (float): spr_precondition -> onemkl::cublas::spr -> spr_postcondition
+cl::sycl::event spr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;
+}
+
+template <>  // USM spr (double): spr_precondition -> onemkl::cublas::spr -> spr_postcondition
+cl::sycl::event spr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;
+}
+
+template <>  // USM hpmv (std::complex<float>): hpmv_precondition -> onemkl::cublas::hpmv -> hpmv_postcondition
+cl::sycl::event hpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
+                                   dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM hpmv (std::complex<double>): hpmv_precondition -> onemkl::cublas::hpmv -> hpmv_postcondition
+cl::sycl::event hpmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
+                                   dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM syrk (float): syrk_precondition -> onemkl::cublas::syrk -> syrk_postcondition
+cl::sycl::event syrk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
+                                   ldc, dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;
+}
+
+template <>  // USM syrk (double): syrk_precondition -> onemkl::cublas::syrk -> syrk_postcondition
+cl::sycl::event syrk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
+                                   ldc, dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;
+}
+
+template <>  // USM syrk (std::complex<float>): syrk_precondition -> onemkl::cublas::syrk -> syrk_postcondition
+cl::sycl::event syrk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
+                                   ldc, dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;
+}
+
+template <>  // USM syrk (std::complex<double>): syrk_precondition -> onemkl::cublas::syrk -> syrk_postcondition
+cl::sycl::event syrk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
+                                   ldc, dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;
+}
+
+template <>  // USM her2 (std::complex<float>): her2_precondition -> onemkl::cublas::her2 -> her2_postcondition
+cl::sycl::event her2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                   dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;
+}
+
+template <>  // USM her2 (std::complex<double>): her2_precondition -> onemkl::cublas::her2 -> her2_postcondition
+cl::sycl::event her2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                   dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;
+}
+
+template <>  // USM hbmv (std::complex<float>): hbmv_precondition -> onemkl::cublas::hbmv -> hbmv_postcondition
+cl::sycl::event hbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;
+}
+
+template <>  // USM hbmv (std::complex<double>): hbmv_precondition -> onemkl::cublas::hbmv -> hbmv_postcondition
+cl::sycl::event hbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;
+}
+
+template <>  // USM rot (std::complex<float> x/y, float c/s): rot_precondition -> onemkl::cublas::rot -> rot_postcondition
+cl::sycl::event rot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;
+}
+
+template <>  // USM rot (std::complex<double> x/y, double c/s): rot_precondition -> onemkl::cublas::rot -> rot_postcondition
+cl::sycl::event rot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;
+}
+
+template <>  // USM rot (float): rot_precondition -> onemkl::cublas::rot -> rot_postcondition
+cl::sycl::event rot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;
+}
+
+template <>  // USM rot (double): rot_precondition -> onemkl::cublas::rot -> rot_postcondition
+cl::sycl::event rot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto ev = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return ev;
+}
+
+template <>  // USM axpy (float): axpy_precondition -> onemkl::cublas::axpy -> axpy_postcondition
+cl::sycl::event axpy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM axpy (double): axpy_precondition -> onemkl::cublas::axpy -> axpy_postcondition
+cl::sycl::event axpy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM axpy (std::complex<float>): axpy_precondition -> onemkl::cublas::axpy -> axpy_postcondition
+cl::sycl::event axpy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM axpy (std::complex<double>): axpy_precondition -> onemkl::cublas::axpy -> axpy_postcondition
+cl::sycl::event axpy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return ev;
+}
+
+template <>  // USM axpy_batch (float): axpy_batch_precondition -> onemkl::cublas::axpy_batch -> axpy_batch_postcondition
+cl::sycl::event axpy_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto ev = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                                         dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return ev;
+}
+
+template <>  // USM axpy_batch (double): axpy_batch_precondition -> onemkl::cublas::axpy_batch -> axpy_batch_postcondition
+cl::sycl::event axpy_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto ev = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                                         dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return ev;
+}
+
+template <>  // USM axpy_batch (std::complex<float>): axpy_batch_precondition -> onemkl::cublas::axpy_batch -> axpy_batch_postcondition
+cl::sycl::event axpy_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto ev = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                                         dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return ev;
+}
+
+template <>  // USM axpy_batch (std::complex<double>): axpy_batch_precondition -> onemkl::cublas::axpy_batch -> axpy_batch_postcondition
+cl::sycl::event axpy_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto ev = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                                         dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return ev;
+}
+
+// gerc, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gerc<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::gerc
+}
+
+// gerc, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gerc<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::gerc
+}
+
+// syr2k, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event syr2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                    c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // result of onemkl::cublas::syr2k
+}
+
+// syr2k, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event syr2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                    c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // result of onemkl::cublas::syr2k
+}
+
+// syr2k, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event syr2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                    c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // result of onemkl::cublas::syr2k
+}
+
+// syr2k, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event syr2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto ev = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                    c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return ev;  // result of onemkl::cublas::syr2k
+}
+
+// gemv, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::gemv
+}
+
+// gemv, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::gemv
+}
+
+// gemv, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::gemv
+}
+
+// gemv, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                   dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::gemv
+}
+
+// her, std::complex<float>, float alpha: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event her<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::cublas::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::her
+}
+
+// her, std::complex<double>, double alpha: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event her<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::cublas::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::her
+}
+
+// hpr, std::complex<float>, float alpha: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event hpr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::cublas::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // result of onemkl::cublas::hpr
+}
+
+// hpr, std::complex<double>, double alpha: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event hpr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::cublas::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // result of onemkl::cublas::hpr
+}
+
+// iamin, float input: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event iamin<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::iamin
+}
+
+// iamin, double input: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event iamin<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::iamin
+}
+
+// iamin, std::complex<float> input: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event iamin<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::iamin
+}
+
+// iamin, std::complex<double> input: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event iamin<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::iamin
+}
+
+// gemm_batch, group form, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto ev = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, group form, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto ev = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, group form, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto ev = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, group form, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);
+    auto ev = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, strided form, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev =
+        onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, strided form, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev =
+        onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, strided form, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev =
+        onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// gemm_batch, strided form, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm_batch<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev =
+        onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // result of onemkl::cublas::gemm_batch
+}
+
+// spmv, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event spmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev =
+        onemkl::cublas::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::spmv
+}
+
+// spmv, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event spmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev =
+        onemkl::cublas::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::spmv
+}
+
+// swap, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event swap<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::swap
+}
+
+// swap, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event swap<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::swap
+}
+
+// swap, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event swap<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::swap
+}
+
+// swap, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event swap<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // result of onemkl::cublas::swap
+}
+
+// geru, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event geru<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::geru
+}
+
+// geru, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event geru<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // result of onemkl::cublas::geru
+}
+
+// nrm2, std::complex<float> input, float result: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event nrm2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::nrm2
+}
+
+// nrm2, std::complex<double> input, double result: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event nrm2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::nrm2
+}
+
+// nrm2, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event nrm2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::nrm2
+}
+
+// nrm2, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event nrm2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // result of onemkl::cublas::nrm2
+}
+
+// gemm, float: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                   ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // result of onemkl::cublas::gemm
+}
+
+// gemm, double: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                   ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // result of onemkl::cublas::gemm
+}
+
+// gemm, std::complex<float>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                   ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // result of onemkl::cublas::gemm
+}
+
+// gemm, std::complex<double>: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event gemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                   ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // result of onemkl::cublas::gemm
+}
+
+// herk, std::complex<float>, float alpha/beta: precondition hook, cublas call, postcondition hook.
+template <> cl::sycl::event herk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                   dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // result of onemkl::cublas::herk
+}
+
+template <>  // USM herk (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event herk<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::herk(
+        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::herk returned
+}
+
+template <>  // USM ger (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event ger<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // whatever onemkl::cublas::ger returned
+}
+
+template <>  // USM ger (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event ger<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::cublas::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // whatever onemkl::cublas::ger returned
+}
+
+template <>  // USM trsm (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trsm returned
+}
+
+template <>  // USM trsm (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trsm returned
+}
+
+template <>  // USM trsm (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trsm returned
+}
+
+template <>  // USM trsm (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trsm returned
+}
+
+template <>  // USM dotu (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event dotu<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::cublas::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // whatever onemkl::cublas::dotu returned
+}
+
+template <>  // USM dotu (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event dotu<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::cublas::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // whatever onemkl::cublas::dotu returned
+}
+
+template <>  // USM hemm (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event hemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::hemm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    hemm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::hemm returned
+}
+
+template <>  // USM hemm (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event hemm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::hemm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    hemm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::hemm returned
+}
+
+template <>  // USM hpr2 (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event hpr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = onemkl::cublas::hpr2(
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // whatever onemkl::cublas::hpr2 returned
+}
+
+template <>  // USM hpr2 (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event hpr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = onemkl::cublas::hpr2(
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // whatever onemkl::cublas::hpr2 returned
+}
+
+template <>  // USM gbmv (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event gbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gbmv(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    gbmv_postcondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::gbmv returned
+}
+
+template <>  // USM gbmv (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event gbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gbmv(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    gbmv_postcondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::gbmv returned
+}
+
+template <>  // USM gbmv (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event gbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gbmv(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    gbmv_postcondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::gbmv returned
+}
+
+template <>  // USM gbmv (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event gbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::cublas::gbmv(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    gbmv_postcondition(
+        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::gbmv returned
+}
+
+template <>  // USM tbmv (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::tbmv(
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tbmv returned
+}
+
+template <>  // USM tbmv (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::tbmv(
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tbmv returned
+}
+
+template <>  // USM tbmv (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::tbmv(
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tbmv returned
+}
+
+template <>  // USM tbmv (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::tbmv(
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tbmv returned
+}
+
+template <>  // USM symm (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event symm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::symm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    symm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::symm returned
+}
+
+template <>  // USM symm (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event symm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::symm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    symm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::symm returned
+}
+
+template <>  // USM symm (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event symm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::symm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    symm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::symm returned
+}
+
+template <>  // USM symm (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event symm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::cublas::symm(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    symm_postcondition(
+        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+    return ev;  // whatever onemkl::cublas::symm returned
+}
+
+template <>  // USM dotc (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event dotc<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::cublas::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // whatever onemkl::cublas::dotc returned
+}
+
+template <>  // USM dotc (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event dotc<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::cublas::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // whatever onemkl::cublas::dotc returned
+}
+
+template <>  // USM syr (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event syr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::cublas::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // whatever onemkl::cublas::syr returned
+}
+
+template <>  // USM syr (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event syr<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::cublas::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // whatever onemkl::cublas::syr returned
+}
+
+template <>  // USM trmm (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trmm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trmm returned
+}
+
+template <>  // USM trmm (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trmm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trmm returned
+}
+
+template <>  // USM trmm (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trmm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trmm returned
+}
+
+template <>  // USM trmm (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trmm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                      alpha, a, lda, b, ldb, dependencies);
+    auto ev = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                                   alpha, a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n,
+                       alpha, a, lda, b, ldb, dependencies);
+    return ev;  // whatever onemkl::cublas::trmm returned
+}
+
+template <>  // USM rotmg (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event rotmg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto ev = onemkl::cublas::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return ev;  // whatever onemkl::cublas::rotmg returned
+}
+
+template <>  // USM rotmg (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event rotmg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto ev = onemkl::cublas::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return ev;  // whatever onemkl::cublas::rotmg returned
+}
+
+template <>  // USM tpsv (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tpsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpsv(
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tpsv returned
+}
+
+template <>  // USM tpsv (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tpsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpsv(
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tpsv returned
+}
+
+template <>  // USM tpsv (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tpsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpsv(
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tpsv returned
+}
+
+template <>  // USM tpsv (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event tpsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::cublas::tpsv(
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::tpsv returned
+}
+
+template <>  // USM trsv (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trsv(
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::trsv returned
+}
+
+template <>  // USM trsv (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trsv(
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::trsv returned
+}
+
+template <>  // USM trsv (std::complex<float>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trsv(
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::trsv returned
+}
+
+template <>  // USM trsv (std::complex<double>), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event trsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::cublas::trsv(
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // whatever onemkl::cublas::trsv returned
+}
+
+template <>  // USM copy (float), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event copy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::copy returned
+}
+
+template <>  // USM copy (double), cuBLAS on NVIDIA GPU: pre-hook, call, post-hook
+cl::sycl::event copy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // whatever onemkl::cublas::copy returned
+}
+
+// USM copy, std::complex<float>: precondition hook, onemkl::cublas::copy, postcondition hook.
+template <> cl::sycl::event copy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto evt = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::copy
+}
+
+// USM copy, std::complex<double>: precondition hook, onemkl::cublas::copy, postcondition hook.
+template <> cl::sycl::event copy<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto evt = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::copy
+}
+
+// USM hemv, std::complex<float>: precondition hook, onemkl::cublas::hemv, postcondition hook.
+template <> cl::sycl::event hemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto evt = onemkl::cublas::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                    dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::hemv
+}
+
+// USM hemv, std::complex<double>: precondition hook, onemkl::cublas::hemv, postcondition hook.
+template <> cl::sycl::event hemv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto evt = onemkl::cublas::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                    dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::hemv
+}
+
+// USM gemmt, float: precondition hook, onemkl::cublas::gemmt, postcondition hook.
+template <> cl::sycl::event gemmt<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto evt = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return evt; // event returned by onemkl::cublas::gemmt
+}
+
+// USM gemmt, double: precondition hook, onemkl::cublas::gemmt, postcondition hook.
+template <> cl::sycl::event gemmt<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto evt = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return evt; // event returned by onemkl::cublas::gemmt
+}
+
+// USM gemmt, std::complex<float>: precondition hook, onemkl::cublas::gemmt, postcondition hook.
+template <> cl::sycl::event gemmt<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto evt = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return evt; // event returned by onemkl::cublas::gemmt
+}
+
+// USM gemmt, std::complex<double>: precondition hook, onemkl::cublas::gemmt, postcondition hook.
+template <> cl::sycl::event gemmt<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto evt = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return evt; // event returned by onemkl::cublas::gemmt
+}
+
+// USM sbmv, float: precondition hook, onemkl::cublas::sbmv, postcondition hook.
+template <> cl::sycl::event sbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto evt = onemkl::cublas::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                    incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return evt; // event returned by onemkl::cublas::sbmv
+}
+
+// USM sbmv, double: precondition hook, onemkl::cublas::sbmv, postcondition hook.
+template <> cl::sycl::event sbmv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto evt = onemkl::cublas::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                    incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return evt; // event returned by onemkl::cublas::sbmv
+}
+
+// USM asum, std::complex<float>: precondition hook, onemkl::cublas::asum, postcondition hook.
+template <> cl::sycl::event asum<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::asum
+}
+
+// USM asum, std::complex<double>: precondition hook, onemkl::cublas::asum, postcondition hook.
+template <> cl::sycl::event asum<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::asum
+}
+
+// USM asum, float: precondition hook, onemkl::cublas::asum, postcondition hook.
+template <> cl::sycl::event asum<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::asum
+}
+
+// USM asum, double: precondition hook, onemkl::cublas::asum, postcondition hook.
+template <> cl::sycl::event asum<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::asum
+}
+
+// USM tbsv, float: precondition hook, onemkl::cublas::tbsv, postcondition hook.
+template <> cl::sycl::event tbsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto evt = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                    dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return evt; // event returned by onemkl::cublas::tbsv
+}
+
+// USM tbsv, double: precondition hook, onemkl::cublas::tbsv, postcondition hook.
+template <> cl::sycl::event tbsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto evt = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                    dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return evt; // event returned by onemkl::cublas::tbsv
+}
+
+// USM tbsv, std::complex<float>: precondition hook, onemkl::cublas::tbsv, postcondition hook.
+template <> cl::sycl::event tbsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto evt = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                    dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return evt; // event returned by onemkl::cublas::tbsv
+}
+
+// USM tbsv, std::complex<double>: precondition hook, onemkl::cublas::tbsv, postcondition hook.
+template <> cl::sycl::event tbsv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto evt = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                    dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return evt; // event returned by onemkl::cublas::tbsv
+}
+
+// USM spr2, float: precondition hook, onemkl::cublas::spr2, postcondition hook.
+template <> cl::sycl::event spr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto evt =
+        onemkl::cublas::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return evt; // event returned by onemkl::cublas::spr2
+}
+
+// USM spr2, double: precondition hook, onemkl::cublas::spr2, postcondition hook.
+template <> cl::sycl::event spr2<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto evt =
+        onemkl::cublas::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return evt; // event returned by onemkl::cublas::spr2
+}
+
+// USM iamax, float: precondition hook, onemkl::cublas::iamax, postcondition hook.
+template <> cl::sycl::event iamax<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::iamax
+}
+
+// USM iamax, double: precondition hook, onemkl::cublas::iamax, postcondition hook.
+template <> cl::sycl::event iamax<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::iamax
+}
+
+// USM iamax, std::complex<float>: precondition hook, onemkl::cublas::iamax, postcondition hook.
+template <> cl::sycl::event iamax<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::iamax
+}
+
+// USM iamax, std::complex<double>: precondition hook, onemkl::cublas::iamax, postcondition hook.
+template <> cl::sycl::event iamax<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto evt = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return evt; // event returned by onemkl::cublas::iamax
+}
+
+// USM rotm, float: precondition hook, onemkl::cublas::rotm, postcondition hook.
+template <> cl::sycl::event rotm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto evt = onemkl::cublas::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return evt; // event returned by onemkl::cublas::rotm
+}
+
+// USM rotm, double: precondition hook, onemkl::cublas::rotm, postcondition hook.
+template <> cl::sycl::event rotm<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto evt = onemkl::cublas::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return evt; // event returned by onemkl::cublas::rotm
+}
+
+// USM rotg, float: precondition hook, onemkl::cublas::rotg, postcondition hook.
+template <> cl::sycl::event rotg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto evt = onemkl::cublas::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return evt; // event returned by onemkl::cublas::rotg
+}
+
+// USM rotg, double: precondition hook, onemkl::cublas::rotg, postcondition hook.
+template <> cl::sycl::event rotg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto evt = onemkl::cublas::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return evt; // event returned by onemkl::cublas::rotg
+}
+
+// USM rotg, std::complex<float> (c is float): precondition hook, onemkl::cublas::rotg, post hook.
+template <> cl::sycl::event rotg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto evt = onemkl::cublas::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return evt; // event returned by onemkl::cublas::rotg
+}
+
+// USM rotg, std::complex<double> (c is double): precondition hook, onemkl::cublas::rotg, post hook.
+template <> cl::sycl::event rotg<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto evt = onemkl::cublas::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return evt; // event returned by onemkl::cublas::rotg
+}
+
+// USM sdsdot, float: precondition hook, onemkl::cublas::sdsdot, postcondition hook.
+template <> cl::sycl::event sdsdot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    auto evt = onemkl::cublas::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies);
+    sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    return evt; // event returned by onemkl::cublas::sdsdot
+}
+
+// USM her2k, std::complex<float>: precondition hook, onemkl::cublas::her2k, postcondition hook.
+template <> cl::sycl::event her2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto evt = onemkl::cublas::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                     c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return evt; // event returned by onemkl::cublas::her2k
+}
+
+// USM her2k, std::complex<double>: precondition hook, onemkl::cublas::her2k, postcondition hook.
+template <> cl::sycl::event her2k<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto evt = onemkl::cublas::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                     c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return evt; // event returned by onemkl::cublas::her2k
+}
+
+// USM dot, float: precondition hook, onemkl::cublas::dot, postcondition hook.
+template <> cl::sycl::event dot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto evt = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return evt; // event returned by onemkl::cublas::dot
+}
+
+// USM dot, double: precondition hook, onemkl::cublas::dot, postcondition hook.
+template <> cl::sycl::event dot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto evt = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return evt; // event returned by onemkl::cublas::dot
+}
+
+// USM dot, float x/y, double result: precondition hook, onemkl::cublas::dot, postcondition hook.
+template <> cl::sycl::event dot<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto evt = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return evt; // event returned by onemkl::cublas::dot
+}
+
+// USM symv, float: precondition hook, onemkl::cublas::symv, postcondition hook.
+template <> cl::sycl::event symv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto evt = onemkl::cublas::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                    dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::symv
+}
+
+// USM symv, double: precondition hook, onemkl::cublas::symv, postcondition hook.
+template <> cl::sycl::event symv<library::cublas, backend::nvidiagpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto evt = onemkl::cublas::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                    dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return evt; // event returned by onemkl::cublas::symv
+}
+
 } //namespace blas
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp b/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp
index 2336bbbd4..91fcfefcb 100644
--- a/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp
+++ b/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp
@@ -31,7 +31,8 @@ using onemkl::side;
 using onemkl::transpose;
 using onemkl::uplo;
 namespace cublas {
-// Level 1
+
+// Buffer APIs
 
 void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<float, 1> &result);
@@ -206,8 +207,6 @@ void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<
 void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 
-// Level 2
-
 void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
           std::int64_t ku, float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
@@ -491,8 +490,6 @@ void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_d
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 
-// Level 3
-
 void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
           std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
           std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
@@ -647,7 +644,6 @@ void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose t
           diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-// Batch API
 
 void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
                 cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
@@ -776,8 +772,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 
-// BLAS-like extensions
-
 void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
            std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
            std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
@@ -841,6 +835,843 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i
               std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
               cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 
+// USM (Unified Shared Memory) APIs: pointer-based overloads that return a cl::sycl::event
+// asum, USM overloads: x is complex<float>, complex<double>, float or double; result is real.
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy, USM overloads (float, double, complex<float>): alpha by value, const x, non-const y.
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+                     std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+                     std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy_batch, USM overloads: n/alpha/incx/incy/group_size are pointers (presumably per group).
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+                           std::int64_t *incx, float **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+                           std::int64_t *incx, double **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+                           const std::complex<float> **x, std::int64_t *incx,
+                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+                           const std::complex<double> **x, std::int64_t *incx,
+                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy, USM overload for complex<double>; the other axpy overloads precede axpy_batch.
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// copy, USM overloads for float, double, complex<float>, complex<double>: const x, non-const y.
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dot, USM overloads: float, double, and a mixed form with float x/y and a double result.
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                    const double *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotc, USM overloads (complex<float>, complex<double>): const x and y, non-const result.
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotu, USM overloads (complex<float>, complex<double>): const x and y, non-const result.
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamin, USM overloads for real and complex x; result is always a std::int64_t pointer.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamax, USM overloads for real and complex x; result is always a std::int64_t pointer.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// nrm2, USM overloads: x is complex<float>, complex<double>, float or double; result is real.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rot, USM overloads: x and y are non-const; c and s are real scalars passed by value.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
+                    double s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                    std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+                    std::int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotg, USM overloads: a, b, c, s are all pointers; c is real even in the complex overloads.
+cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
+                     float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+                     double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotm, USM overloads (float, double): x, y and param are non-const pointers.
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotmg, USM overloads (float, double): d1, d2, x1, param are pointers; y1 is passed by value.
+cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                      float *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+                      double *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// scal, USM overloads: alpha by value, non-const x; includes real alpha with complex x.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// sdsdot, USM (float only): scalar sb by value, const x and y, float result pointer.
+cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                       std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// swap, USM overloads for real and complex types; both x and y are non-const pointers.
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gbmv, USM overloads for real and complex types: const a and x, non-const y, scalars by value.
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::int64_t kl, std::int64_t ku, float alpha, const float *a,
+                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::int64_t kl, std::int64_t ku, double alpha, const double *a,
+                     std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemv, USM overloads for real and complex types: const a and x, non-const y, scalars by value.
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     double alpha, const double *a, std::int64_t lda, const double *x,
+                     std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// ger, USM overloads (float, double): alpha by value, const x and y, non-const a.
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                    double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gerc, USM overloads (complex<float>, complex<double>): const x and y, non-const a.
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// geru, USM overloads (complex<float>, complex<double>): const x and y, non-const a.
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hbmv, USM overloads (complex<float>, complex<double>): const a and x, non-const y.
+cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hemv, USM overloads (complex<float>, complex<double>): const a and x, non-const y.
+cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her, USM overloads: real alpha (float/double), const complex x, non-const complex a.
+cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her2, USM overloads (complex<float>, complex<double>): const x and y, non-const a.
+cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpmv, USM overloads (complex<float>, complex<double>): const a (no lda), const x, non-const y.
+cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr, USM overloads: real alpha (float/double), const complex x, non-const a with no lda.
+cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr2, USM overloads (complex<float>, complex<double>): const x and y, non-const a, no lda.
+cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// sbmv, USM overloads (float, double): const a and x, non-const y.
+cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                     float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+                     double alpha, const double *a, std::int64_t lda, const double *x,
+                     std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spmv, USM overloads (float, double): const a (no lda) and x, non-const y.
+cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                     const float *a, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                     const double *a, const double *x, std::int64_t incx, double beta, double *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr, USM overloads (float, double): const x, non-const a, no lda parameter.
+cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr2, USM overloads (float, double): const x and y, non-const a, no lda parameter.
+cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symv, USM overloads (float, double): const a and x, non-const y.
+cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                     const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                     float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr, USM overloads (float, double): const x, non-const a with lda.
+cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2, USM overloads (float, double): const x and y, non-const a with lda.
+cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbmv, USM overloads for real and complex types: const a, non-const x; no alpha/beta scalars.
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbsv (USM pointer API), double overload; float is above. `a` is const, `x` is non-const.
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbsv (USM pointer API), std::complex<float> overload.
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbsv (USM pointer API), std::complex<double> overload.
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpmv (USM pointer API), double overload; float is above. `a` has no lda parameter here.
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const double *a, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpmv (USM pointer API), std::complex<float> overload.
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpmv (USM pointer API), std::complex<double> overload.
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpsv (USM pointer API), double overload; float is above. `a` has no lda parameter here.
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const double *a, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpsv (USM pointer API), std::complex<float> overload.
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpsv (USM pointer API), std::complex<double> overload.
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmv (USM pointer API), double overload; float is above. `a` is const, `x` is non-const.
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmv (USM pointer API), std::complex<float> overload.
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmv (USM pointer API), std::complex<double> overload.
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<double> *a, std::int64_t lda,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsv (USM pointer API), double overload; float is above. `a` is const, `x` is non-const.
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsv (USM pointer API), std::complex<float> overload.
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsv (USM pointer API), std::complex<double> overload.
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     std::int64_t n, const std::complex<double> *a, std::int64_t lda,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm (USM pointer API), double overload; float is above. `a`/`b` are const, `c` is non-const.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                     std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm (USM pointer API), std::complex<float> overload.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm (USM pointer API), std::complex<double> overload.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
+                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                     std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hemm (USM pointer API), std::complex<double> overload; std::complex<float> is above.
+cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                     std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, float alpha, const std::complex<float> *a, std::int64_t lda,
+                     float beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// herk (USM pointer API): std::complex<double> matrices with real (double) alpha and beta.
+cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, double alpha, const std::complex<double> *a, std::int64_t lda,
+                     double beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, float beta,
+                      std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her2k (USM pointer API): std::complex<double> alpha and matrices, real (double) beta.
+cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                      std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                      double beta, std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b,
+                     std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symm (USM pointer API), double overload; float is above. `a`/`b` are const, `c` is non-const.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                     const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symm (USM pointer API), std::complex<float> overload.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                     std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symm (USM pointer API), std::complex<double> overload.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                     std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta,
+                     float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syrk (USM pointer API), double overload; float is above. `a` is const, `c` is non-const.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta,
+                     double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syrk (USM pointer API), std::complex<float> overload.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syrk (USM pointer API), std::complex<double> overload.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
+                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2k (USM pointer API), double overload; float is above. `a`/`b` are const, `c` is non-const.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, double alpha, const double *a, std::int64_t lda,
+                      const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2k (USM pointer API), std::complex<float> overload.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                      std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2k (USM pointer API), std::complex<double> overload.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
+                      std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                      std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                      std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
+                     std::int64_t lda, float *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmm (USM pointer API), double overload; float is above. `a` is const, `b` is non-const.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, double *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmm (USM pointer API), std::complex<float> overload.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmm (USM pointer API), std::complex<double> overload.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
+                     std::int64_t lda, float *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsm (USM pointer API), double overload; float is above. `a` is const, `b` is non-const.
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, double *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsm (USM pointer API), std::complex<float> overload.
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsm (USM pointer API), std::complex<double> overload.
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha,
+                           const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb,
+                           float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), group_count/group_size form: double overload; float is above.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha,
+                           const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb,
+                           double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), group_count/group_size form: std::complex<float> overload.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<float> *alpha, const std::complex<float> **a,
+                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), group_count/group_size form: std::complex<double> overload.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<double> *alpha, const std::complex<double> **a,
+                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                           const float *a, std::int64_t lda, std::int64_t stride_a, const float *b,
+                           std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
+                           std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), stride_a/stride_b/stride_c + batch_size form: double overload.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                           const double *a, std::int64_t lda, std::int64_t stride_a,
+                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+                           double *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), stride + batch_size form: std::complex<float> overload.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch (USM pointer API), stride + batch_size form: std::complex<double> overload.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemmt (USM pointer API), double overload; float is above. `a`/`b` are const, `c` is non-const.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemmt (USM pointer API), std::complex<float> overload.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemmt (USM pointer API), std::complex<double> overload.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
 } // namespace cublas
 } // namespace onemkl
 
diff --git a/include/onemkl/blas/detail/mklcpu/blas_ct.hpp b/include/onemkl/blas/detail/mklcpu/blas_ct.hpp
index 663f3d989..d816e2af6 100644
--- a/include/onemkl/blas/detail/mklcpu/blas_ct.hpp
+++ b/include/onemkl/blas/detail/mklcpu/blas_ct.hpp
@@ -33,14 +33,13 @@
 
 #include "onemkl_blas_mklcpu.hpp"
 
+#include "onemkl/blas/detail/blas_ct_templates.hpp"
+
 namespace onemkl {
 namespace blas {
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+// Buffer APIs
+
 template <>
 void syr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -52,11 +51,6 @@ void syr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -68,9 +62,6 @@ void syr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
@@ -79,9 +70,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha, cl::sycl::buffer<double, 1> &x,
@@ -91,9 +79,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<float> alpha,
@@ -104,9 +89,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<double> alpha,
@@ -117,9 +99,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -129,9 +108,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha,
@@ -142,10 +118,6 @@ void scal<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -156,10 +128,6 @@ void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -170,11 +138,6 @@ void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -187,11 +150,6 @@ void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -204,10 +162,6 @@ void trmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -218,10 +172,6 @@ void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -232,10 +182,6 @@ void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -247,10 +193,6 @@ void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -262,10 +204,6 @@ void tpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a);
 template <>
 void spr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -276,10 +214,6 @@ void spr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a);
 template <>
 void spr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -290,12 +224,6 @@ void spr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<float> alpha,
@@ -309,12 +237,6 @@ void hpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<double> alpha,
@@ -328,11 +250,6 @@ void hpmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<float, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -344,11 +261,6 @@ void syrk<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<double, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -360,12 +272,6 @@ void syrk<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -376,12 +282,6 @@ void syrk<library::intelmkl, backend::intelcpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -392,12 +292,6 @@ void syrk<library::intelmkl, backend::intelcpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -409,12 +303,6 @@ void her2<library::intelmkl, backend::intelcpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -426,12 +314,6 @@ void her2<library::intelmkl, backend::intelcpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -443,12 +325,6 @@ void hbmv<library::intelmkl, backend::intelcpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -460,11 +336,6 @@ void hbmv<library::intelmkl, backend::intelcpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -476,11 +347,6 @@ void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
 template <>
 void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -492,10 +358,6 @@ void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -506,10 +368,6 @@ void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       double c, double s);
 template <>
 void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -520,10 +378,6 @@ void rot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -533,10 +387,6 @@ void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha, cl::sycl::buffer<double, 1> &x,
@@ -547,10 +397,6 @@ void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<float> alpha,
@@ -563,10 +409,6 @@ void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<double> alpha,
@@ -579,12 +421,6 @@ void axpy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -596,12 +432,6 @@ void gerc<library::intelmkl, backend::intelcpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -613,11 +443,6 @@ void gerc<library::intelmkl, backend::intelcpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                         float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose trans, std::int64_t n, std::int64_t k,
@@ -630,11 +455,6 @@ void syr2k<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo up
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose trans, std::int64_t n, std::int64_t k,
@@ -647,13 +467,6 @@ void syr2k<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo up
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -665,13 +478,6 @@ void syr2k<library::intelmkl, backend::intelcpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -683,11 +489,6 @@ void syr2k<library::intelmkl, backend::intelcpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, float alpha,
@@ -700,11 +501,6 @@ void gemv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, double alpha,
@@ -717,12 +513,6 @@ void gemv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -734,12 +524,6 @@ void gemv<library::intelmkl, backend::intelcpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -751,10 +535,6 @@ void gemv<library::intelmkl, backend::intelcpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
 template <>
 void her<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -767,10 +547,6 @@ void her<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
 template <>
 void her<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -783,10 +559,6 @@ void her<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -798,10 +570,6 @@ void hpr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -813,9 +581,6 @@ void hpr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -825,9 +590,6 @@ void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -837,10 +599,6 @@ void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -851,10 +609,6 @@ void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -865,128 +619,6 @@ void iamin<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                              cl::sycl::buffer<transpose, 1> &transb,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<std::int64_t, 1> &k,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb,
-                              cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                              cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              cl::sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1002,14 +634,6 @@ void gemm_batch<library::intelmkl, backend::intelcpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                              cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              cl::sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1025,15 +649,6 @@ void gemm_batch<library::intelmkl, backend::intelcpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1050,15 +665,6 @@ void gemm_batch<library::intelmkl, backend::intelcpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1075,11 +681,6 @@ void gemm_batch<library::intelmkl, backend::intelcpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -1092,11 +693,6 @@ void spmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -1109,12 +705,6 @@ void spmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1128,13 +718,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                            float alpha, cl::sycl::buffer<int8_t, 1> &a, std::int64_t lda,
-                            int8_t ao, cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb,
-                            uint8_t bo, float beta, cl::sycl::buffer<int32_t, 1> &c,
-                            std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m,
@@ -1149,12 +732,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(
                            beta, c, ldc, co);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1168,12 +745,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                            cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                            cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1185,13 +756,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                            std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                            std::int64_t ldb, std::complex<float> beta,
-                            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1203,14 +767,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<double> alpha,
-                            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                            std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1222,12 +778,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                            cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1241,9 +791,6 @@ void gemm_ext<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1253,9 +800,6 @@ void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1265,10 +809,6 @@ void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1280,10 +820,6 @@ void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1295,12 +831,6 @@ void swap<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -1312,12 +842,6 @@ void geru<library::intelmkl, backend::intelcpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -1329,10 +853,6 @@ void geru<library::intelmkl, backend::intelcpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1343,10 +863,6 @@ void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1357,9 +873,6 @@ void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1369,9 +882,6 @@ void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1381,11 +891,6 @@ void nrm2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1399,12 +904,6 @@ void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1418,13 +917,6 @@ void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1436,13 +928,6 @@ void gemm<library::intelmkl, backend::intelcpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1454,11 +939,6 @@ void gemm<library::intelmkl, backend::intelcpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                        cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1472,11 +952,6 @@ void gemm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1487,11 +962,6 @@ void herk<library::intelmkl, backend::intelcpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1502,11 +972,6 @@ void herk<library::intelmkl, backend::intelcpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void ger<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t m,
                                                std::int64_t n, float alpha,
@@ -1518,11 +983,6 @@ void ger<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void ger<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t m,
                                                std::int64_t n, double alpha,
@@ -1534,11 +994,6 @@ void ger<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -1553,11 +1008,6 @@ void trsm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -1572,11 +1022,6 @@ void trsm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1591,11 +1036,6 @@ void trsm<library::intelmkl, backend::intelcpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1610,11 +1050,6 @@ void trsm<library::intelmkl, backend::intelcpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotu<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1627,11 +1062,6 @@ void dotu<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotu<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1644,13 +1074,6 @@ void dotu<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1662,13 +1085,6 @@ void hemm<library::intelmkl, backend::intelcpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1680,11 +1096,6 @@ void hemm<library::intelmkl, backend::intelcpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<float> alpha,
@@ -1698,11 +1109,6 @@ void hpr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<double> alpha,
@@ -1716,12 +1122,6 @@ void hpr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1735,12 +1135,6 @@ void gbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1754,13 +1148,6 @@ void gbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, transpos
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1772,13 +1159,6 @@ void gbmv<library::intelmkl, backend::intelcpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1790,10 +1170,6 @@ void gbmv<library::intelmkl, backend::intelcpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -1805,10 +1181,6 @@ void tbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -1820,11 +1192,6 @@ void tbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1835,11 +1202,6 @@ void tbmv<library::intelmkl, backend::intelcpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1850,11 +1212,6 @@ void tbmv<library::intelmkl, backend::intelcpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1867,11 +1224,6 @@ void symm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                        double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1884,13 +1236,6 @@ void symm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1902,13 +1247,6 @@ void symm<library::intelmkl, backend::intelcpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1920,11 +1258,6 @@ void symm<library::intelmkl, backend::intelcpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotc<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1937,11 +1270,6 @@ void dotc<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotc<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1954,10 +1282,6 @@ void dotc<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void syr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -1968,10 +1292,6 @@ void syr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -1982,11 +1302,6 @@ void syr<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo uppe
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -2001,11 +1316,6 @@ void trmm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -2020,11 +1330,6 @@ void trmm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2039,11 +1344,6 @@ void trmm<library::intelmkl, backend::intelcpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2058,10 +1358,6 @@ void trmm<library::intelmkl, backend::intelcpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1,
-                         cl::sycl::buffer<float, 1> &d2, cl::sycl::buffer<float, 1> &x1, float y1,
-                         cl::sycl::buffer<float, 1> &param);
 template <>
 void rotmg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                  cl::sycl::buffer<float, 1> &d1,
@@ -2073,10 +1369,6 @@ void rotmg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1,
-                         cl::sycl::buffer<double, 1> &d2, cl::sycl::buffer<double, 1> &x1,
-                         double y1, cl::sycl::buffer<double, 1> &param);
 template <>
 void rotmg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                  cl::sycl::buffer<double, 1> &d1,
@@ -2088,10 +1380,6 @@ void rotmg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2102,10 +1390,6 @@ void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2116,10 +1400,6 @@ void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2131,10 +1411,6 @@ void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2146,10 +1422,6 @@ void tpsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2160,10 +1432,6 @@ void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2174,11 +1442,6 @@ void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2191,11 +1454,6 @@ void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2208,9 +1466,6 @@ void trsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2220,9 +1475,6 @@ void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2232,10 +1484,6 @@ void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2247,10 +1495,6 @@ void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2262,12 +1506,6 @@ void copy<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -2279,12 +1517,6 @@ void hemv<library::intelmkl, backend::intelcpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -2296,12 +1528,6 @@ void hemv<library::intelmkl, backend::intelcpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                         cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose transa, transpose transb, std::int64_t n,
@@ -2318,12 +1544,6 @@ void gemmt<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo up
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                         cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose transa, transpose transb, std::int64_t n,
@@ -2340,13 +1560,6 @@ void gemmt<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo up
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2361,13 +1574,6 @@ void gemmt<library::intelmkl, backend::intelcpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2382,11 +1588,6 @@ void gemmt<library::intelmkl, backend::intelcpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::int64_t k, float alpha,
@@ -2399,11 +1600,6 @@ void sbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::int64_t k, double alpha,
@@ -2416,10 +1612,6 @@ void sbmv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2430,10 +1622,6 @@ void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2444,9 +1632,6 @@ void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2456,9 +1641,6 @@ void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2468,10 +1650,6 @@ void asum<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2483,10 +1661,6 @@ void tbsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2498,11 +1672,6 @@ void tbsv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2513,11 +1682,6 @@ void tbsv<library::intelmkl, backend::intelcpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2528,11 +1692,6 @@ void tbsv<library::intelmkl, backend::intelcpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a);
 template <>
 void spr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -2544,11 +1703,6 @@ void spr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a);
 template <>
 void spr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -2560,9 +1714,6 @@ void spr2<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2572,9 +1723,6 @@ void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2584,10 +1732,6 @@ void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2598,10 +1742,6 @@ void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2612,118 +1752,6 @@ void iamax<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                              cl::sycl::buffer<uplo, 1> &upper_lower,
-                              cl::sycl::buffer<transpose, 1> &trans,
-                              cl::sycl::buffer<diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelcpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2738,12 +1766,6 @@ void trsm_batch<library::intelmkl, backend::intelcpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2758,13 +1780,6 @@ void trsm_batch<library::intelmkl, backend::intelcpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2780,13 +1795,6 @@ void trsm_batch<library::intelmkl, backend::intelcpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2802,10 +1810,6 @@ void trsm_batch<library::intelmkl, backend::intelcpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &param);
 template <>
 void rotm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2816,10 +1820,6 @@ void rotm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &param);
 template <>
 void rotm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2830,10 +1830,6 @@ void rotm<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<float, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<float, 1> &a,
@@ -2845,10 +1841,6 @@ void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<double, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<double, 1> &a,
@@ -2860,10 +1852,6 @@ void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<std::complex<float>, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<std::complex<float>, 1> &a,
@@ -2875,11 +1863,6 @@ void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &b,
-                        cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<std::complex<double>, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<std::complex<double>, 1> &a,
@@ -2891,11 +1874,6 @@ void rotg<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb,
-                          cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                          cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                          cl::sycl::buffer<float, 1> &result);
 template <>
 void sdsdot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n, float sb,
                                                   cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2906,12 +1884,6 @@ void sdsdot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::i
     sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void her2k<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2923,13 +1895,6 @@ void her2k<library::intelmkl, backend::intelcpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void her2k<library::intelmkl, backend::intelcpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2941,10 +1906,6 @@ void her2k<library::intelmkl, backend::intelcpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2955,10 +1916,6 @@ void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2969,10 +1926,6 @@ void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2983,11 +1936,6 @@ void dot<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void symv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -3000,11 +1948,6 @@ void symv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void symv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -3017,6 +1960,2068 @@ void symv<library::intelmkl, backend::intelcpu>(cl::sycl::queue &queue, uplo upp
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
+// USM (Unified Shared Memory) APIs: pointer-based overloads that return cl::sycl::event
+
+template <>  // USM syr2 specialization (intelmkl / intelcpu), float data
+cl::sycl::event syr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklcpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // result of onemkl::mklcpu::syr2, returned as cl::sycl::event
+}
+
+template <>  // USM syr2 specialization (intelmkl / intelcpu), double data
+cl::sycl::event syr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklcpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // result of onemkl::mklcpu::syr2, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), float data
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), double data
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), float alpha, std::complex<float> x
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM scal specialization (intelmkl / intelcpu), double alpha, std::complex<double> x
+cl::sycl::event scal<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::scal, returned as cl::sycl::event
+}
+
+template <>  // USM trmv specialization (intelmkl / intelcpu), float data
+cl::sycl::event trmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::trmv, returned as cl::sycl::event
+}
+
+template <>  // USM trmv specialization (intelmkl / intelcpu), double data
+cl::sycl::event trmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::trmv, returned as cl::sycl::event
+}
+
+template <>  // USM trmv specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event trmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::trmv, returned as cl::sycl::event
+}
+
+template <>  // USM trmv specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event trmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::trmv, returned as cl::sycl::event
+}
+
+template <>  // USM tpmv specialization (intelmkl / intelcpu), float data
+cl::sycl::event tpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::tpmv, returned as cl::sycl::event
+}
+
+template <>  // USM tpmv specialization (intelmkl / intelcpu), double data
+cl::sycl::event tpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::tpmv, returned as cl::sycl::event
+}
+
+template <>  // USM tpmv specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event tpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::tpmv, returned as cl::sycl::event
+}
+
+template <>  // USM tpmv specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event tpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // result of onemkl::mklcpu::tpmv, returned as cl::sycl::event
+}
+
+template <>  // USM spr specialization (intelmkl / intelcpu), float data
+cl::sycl::event spr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto done = onemkl::mklcpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return done;  // result of onemkl::mklcpu::spr, returned as cl::sycl::event
+}
+
+template <>  // USM spr specialization (intelmkl / intelcpu), double data
+cl::sycl::event spr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto done = onemkl::mklcpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return done;  // result of onemkl::mklcpu::spr, returned as cl::sycl::event
+}
+
+template <>  // USM hpmv specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event hpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =
+        onemkl::mklcpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::hpmv, returned as cl::sycl::event
+}
+
+template <>  // USM hpmv specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event hpmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =
+        onemkl::mklcpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::hpmv, returned as cl::sycl::event
+}
+
+template <>  // USM syrk specialization (intelmkl / intelcpu), float data
+cl::sycl::event syrk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // result of onemkl::mklcpu::syrk, returned as cl::sycl::event
+}
+
+template <>  // USM syrk specialization (intelmkl / intelcpu), double data
+cl::sycl::event syrk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // result of onemkl::mklcpu::syrk, returned as cl::sycl::event
+}
+
+template <>  // USM syrk specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event syrk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // result of onemkl::mklcpu::syrk, returned as cl::sycl::event
+}
+
+template <>  // USM syrk specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event syrk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // result of onemkl::mklcpu::syrk, returned as cl::sycl::event
+}
+
+template <>  // USM her2 specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event her2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklcpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // result of onemkl::mklcpu::her2, returned as cl::sycl::event
+}
+
+template <>  // USM her2 specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event her2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklcpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // result of onemkl::mklcpu::her2, returned as cl::sycl::event
+}
+
+template <>  // USM hbmv specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event hbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // result of onemkl::mklcpu::hbmv, returned as cl::sycl::event
+}
+
+template <>  // USM hbmv specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event hbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // result of onemkl::mklcpu::hbmv, returned as cl::sycl::event
+}
+
+template <>  // USM rot specialization (intelmkl / intelcpu), std::complex<float> x/y, float c/s
+cl::sycl::event rot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // result of onemkl::mklcpu::rot, returned as cl::sycl::event
+}
+
+template <>  // USM rot specialization (intelmkl / intelcpu), std::complex<double> x/y, double c/s
+cl::sycl::event rot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // result of onemkl::mklcpu::rot, returned as cl::sycl::event
+}
+
+template <>  // USM rot specialization (intelmkl / intelcpu), float data
+cl::sycl::event rot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // result of onemkl::mklcpu::rot, returned as cl::sycl::event
+}
+
+template <>  // USM rot specialization (intelmkl / intelcpu), double data
+cl::sycl::event rot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // result of onemkl::mklcpu::rot, returned as cl::sycl::event
+}
+
+template <>  // USM axpy specialization (intelmkl / intelcpu), float data
+cl::sycl::event axpy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::axpy, returned as cl::sycl::event
+}
+
+template <>  // USM axpy specialization (intelmkl / intelcpu), double data
+cl::sycl::event axpy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::axpy, returned as cl::sycl::event
+}
+
+template <>  // USM axpy specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event axpy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::axpy, returned as cl::sycl::event
+}
+
+template <>  // USM axpy specialization (intelmkl / intelcpu), std::complex<double> data
+cl::sycl::event axpy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // result of onemkl::mklcpu::axpy, returned as cl::sycl::event
+}
+
+template <>  // USM axpy_batch specialization (intelmkl / intelcpu), float data
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // result of onemkl::mklcpu::axpy_batch, returned as cl::sycl::event
+}
+
+template <>  // USM axpy_batch specialization (intelmkl / intelcpu), double data
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // result of onemkl::mklcpu::axpy_batch, returned as cl::sycl::event
+}
+
+template <>  // USM axpy_batch specialization (intelmkl / intelcpu), std::complex<float> data
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // result of onemkl::mklcpu::axpy_batch, returned as cl::sycl::event
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy,
+                            group_count, group_size, dependencies);
+    auto ev = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy,
+                                         group_count, group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy,
+                             group_count, group_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::axpy_batch
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event gerc<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gerc
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event gerc<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gerc
+}
+
+template <>  // float specialization
+cl::sycl::event syr2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                    b, ldb, beta, c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                        b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::syr2k
+}
+
+template <>  // double specialization
+cl::sycl::event syr2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                    b, ldb, beta, c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                        b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::syr2k
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event syr2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                    b, ldb, beta, c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                        b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::syr2k
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event syr2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                    b, ldb, beta, c, ldc, dependencies);
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda,
+                        b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::syr2k
+}
+
+template <>  // float specialization
+cl::sycl::event gemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda,
+                                   x, incx, beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemv
+}
+
+template <>  // double specialization
+cl::sycl::event gemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda,
+                                   x, incx, beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemv
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event gemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda,
+                                   x, incx, beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemv
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event gemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda,
+                                   x, incx, beta, y, incy, dependencies);
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemv
+}
+
+template <>  // std::complex<float> data, float alpha
+cl::sycl::event her<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::her
+}
+
+template <>  // std::complex<double> data, double alpha
+cl::sycl::event her<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::her
+}
+
+template <>  // std::complex<float> data, float alpha
+cl::sycl::event hpr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::mklcpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::hpr
+}
+
+template <>  // std::complex<double> data, double alpha
+cl::sycl::event hpr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto ev = onemkl::mklcpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::hpr
+}
+
+template <>  // float input, std::int64_t result
+cl::sycl::event iamin<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::iamin
+}
+
+template <>  // double input, std::int64_t result
+cl::sycl::event iamin<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::iamin
+}
+
+template <>  // std::complex<float> input, std::int64_t result
+cl::sycl::event iamin<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::iamin
+}
+
+template <>  // std::complex<double> input, std::int64_t result
+cl::sycl::event iamin<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies);
+    iamin_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::iamin
+}
+
+template <>  // group form (per-group parameter arrays), float
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                            beta, c, ldc, group_count, group_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                             beta, c, ldc, group_count, group_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // group form (per-group parameter arrays), double
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                            beta, c, ldc, group_count, group_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                             beta, c, ldc, group_count, group_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // group form (per-group parameter arrays), std::complex<float>
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                            beta, c, ldc, group_count, group_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                             beta, c, ldc, group_count, group_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // group form (per-group parameter arrays), std::complex<double>
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                            beta, c, ldc, group_count, group_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                         beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                             beta, c, ldc, group_count, group_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // strided form (stride_a/b/c + batch_size), float
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                            b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                         stride_a, b, ldb, stride_b, beta, c, ldc,
+                                         stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                             b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // strided form (stride_a/b/c + batch_size), double
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                            b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                         stride_a, b, ldb, stride_b, beta, c, ldc,
+                                         stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                             b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // strided form (stride_a/b/c + batch_size), std::complex<float>
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                            b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                         stride_a, b, ldb, stride_b, beta, c, ldc,
+                                         stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                             b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // strided form (stride_a/b/c + batch_size), std::complex<double>
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                            b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto ev = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                         stride_a, b, ldb, stride_b, beta, c, ldc,
+                                         stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a,
+                             b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm_batch
+}
+
+template <>  // float specialization
+cl::sycl::event spmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::spmv(queue, upper_lower, n, alpha, a, x, incx,
+                                   beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::spmv
+}
+
+template <>  // double specialization
+cl::sycl::event spmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::spmv(queue, upper_lower, n, alpha, a, x, incx,
+                                   beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::spmv
+}
+
+template <>  // float specialization
+cl::sycl::event swap<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::swap
+}
+
+template <>  // double specialization
+cl::sycl::event swap<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::swap
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event swap<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::swap
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event swap<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto ev = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies);
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::swap
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event geru<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::geru
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event geru<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklcpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::geru
+}
+
+template <>  // std::complex<float> input, float result
+cl::sycl::event nrm2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::nrm2
+}
+
+template <>  // std::complex<double> input, double result
+cl::sycl::event nrm2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::nrm2
+}
+
+template <>  // float input, float result
+cl::sycl::event nrm2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::nrm2
+}
+
+template <>  // double input, double result
+cl::sycl::event nrm2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);
+    auto ev = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies);
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::nrm2
+}
+
+template <>  // float specialization
+cl::sycl::event gemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                      b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda,
+                                   b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm
+}
+
+template <>  // double specialization
+cl::sycl::event gemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                      b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda,
+                                   b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm
+}
+
+template <>  // std::complex<float> specialization
+cl::sycl::event gemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                      b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda,
+                                   b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm
+}
+
+template <>  // std::complex<double> specialization
+cl::sycl::event gemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                      b, ldb, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda,
+                                   b, ldb, beta, c, ldc, dependencies);
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda,
+                       b, ldb, beta, c, ldc, dependencies);
+    return ev;  // event returned by onemkl::mklcpu::gemm
+}
+
+template <>  // herk (std::complex<float> a/c with float alpha/beta, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::herk.
+cl::sycl::event herk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // Event from mklcpu::herk; herk_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // herk (std::complex<double> a/c with double alpha/beta, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::herk.
+cl::sycl::event herk<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklcpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // Event from mklcpu::herk; herk_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // ger (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::ger.
+cl::sycl::event ger<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done = onemkl::mklcpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // Event from mklcpu::ger; ger_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // ger (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::ger.
+cl::sycl::event ger<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done = onemkl::mklcpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // Event from mklcpu::ger; ger_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsm (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsm.
+cl::sycl::event trsm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trsm; trsm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsm (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsm.
+cl::sycl::event trsm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trsm; trsm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsm (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsm.
+cl::sycl::event trsm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trsm; trsm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsm (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsm.
+cl::sycl::event trsm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trsm; trsm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // dotu (std::complex<float>, pointer arguments incl. 'result'): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::dotu.
+cl::sycl::event dotu<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from mklcpu::dotu; dotu_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // dotu (std::complex<double>, pointer arguments incl. 'result'): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::dotu.
+cl::sycl::event dotu<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from mklcpu::dotu; dotu_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // hemm (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::hemm.
+cl::sycl::event hemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::hemm; hemm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // hemm (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::hemm.
+cl::sycl::event hemm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::hemm; hemm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // hpr2 (std::complex<float>, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::hpr2.
+cl::sycl::event hpr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklcpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // Event from mklcpu::hpr2; hpr2_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // hpr2 (std::complex<double>, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::hpr2.
+cl::sycl::event hpr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklcpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // Event from mklcpu::hpr2; hpr2_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // gbmv (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::gbmv.
+cl::sycl::event gbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from mklcpu::gbmv; gbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // gbmv (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::gbmv.
+cl::sycl::event gbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from mklcpu::gbmv; gbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // gbmv (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::gbmv.
+cl::sycl::event gbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from mklcpu::gbmv; gbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // gbmv (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::gbmv.
+cl::sycl::event gbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from mklcpu::gbmv; gbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tbmv (float, pointer arguments; no alpha/beta scalars): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tbmv.
+cl::sycl::event tbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::tbmv; tbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tbmv (double, pointer arguments; no alpha/beta scalars): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tbmv.
+cl::sycl::event tbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::tbmv; tbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tbmv (std::complex<float>, pointer arguments; no alpha/beta scalars): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tbmv.
+cl::sycl::event tbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::tbmv; tbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tbmv (std::complex<double>, pointer arguments; no alpha/beta scalars): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tbmv.
+cl::sycl::event tbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::tbmv; tbmv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // symm (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::symm.
+cl::sycl::event symm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::symm; symm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // symm (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::symm.
+cl::sycl::event symm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::symm; symm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // symm (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::symm.
+cl::sycl::event symm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::symm; symm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // symm (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::symm.
+cl::sycl::event symm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                     beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return done;  // Event from mklcpu::symm; symm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // dotc (std::complex<float>, pointer arguments incl. 'result'): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::dotc.
+cl::sycl::event dotc<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from mklcpu::dotc; dotc_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // dotc (std::complex<double>, pointer arguments incl. 'result'): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::dotc.
+cl::sycl::event dotc<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from mklcpu::dotc; dotc_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // syr (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::syr.
+cl::sycl::event syr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto done = onemkl::mklcpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return done;  // Event from mklcpu::syr; syr_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // syr (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::syr.
+cl::sycl::event syr<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto done = onemkl::mklcpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return done;  // Event from mklcpu::syr; syr_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trmm (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trmm.
+cl::sycl::event trmm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trmm; trmm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trmm (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trmm.
+cl::sycl::event trmm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trmm; trmm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trmm (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trmm.
+cl::sycl::event trmm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trmm; trmm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trmm (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trmm.
+cl::sycl::event trmm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                     a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return done;  // Event from mklcpu::trmm; trmm_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // rotmg (float; d1, d2, x1, param are pointers, y1 is by value): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::rotmg.
+cl::sycl::event rotmg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto done = onemkl::mklcpu::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return done;  // Event from mklcpu::rotmg; rotmg_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // rotmg (double; d1, d2, x1, param are pointers, y1 is by value): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::rotmg.
+cl::sycl::event rotmg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto done = onemkl::mklcpu::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return done;  // Event from mklcpu::rotmg; rotmg_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tpsv (float, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tpsv.
+cl::sycl::event tpsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // Event from mklcpu::tpsv; tpsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tpsv (double, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tpsv.
+cl::sycl::event tpsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // Event from mklcpu::tpsv; tpsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tpsv (std::complex<float>, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tpsv.
+cl::sycl::event tpsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // Event from mklcpu::tpsv; tpsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // tpsv (std::complex<double>, pointer arguments; 'a' has no leading-dimension parameter): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::tpsv.
+cl::sycl::event tpsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // Event from mklcpu::tpsv; tpsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsv (float, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsv.
+cl::sycl::event trsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::trsv; trsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsv (double, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsv.
+cl::sycl::event trsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::trsv; trsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsv (std::complex<float>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsv.
+cl::sycl::event trsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::trsv; trsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // trsv (std::complex<double>, pointer arguments): intelmkl/intelcpu specialization that forwards to onemkl::mklcpu::trsv.
+cl::sycl::event trsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // Event from mklcpu::trsv; trsv_postcondition above was passed 'dependencies', not this event.
+}
+
+template <>  // USM copy, float: wraps mklcpu::copy in pre/postcondition calls
+cl::sycl::event copy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM copy, double: wraps mklcpu::copy in pre/postcondition calls
+cl::sycl::event copy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM copy, std::complex<float>: wraps mklcpu::copy in pre/postcondition calls
+cl::sycl::event copy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM copy, std::complex<double>: wraps mklcpu::copy in pre/postcondition calls
+cl::sycl::event copy<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM hemv, std::complex<float>: wraps mklcpu::hemv in pre/postcondition calls
+cl::sycl::event hemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklcpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM hemv, std::complex<double>: wraps mklcpu::hemv in pre/postcondition calls
+cl::sycl::event hemv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklcpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM gemmt, float: wraps mklcpu::gemmt in pre/postcondition calls
+cl::sycl::event gemmt<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM gemmt, double: wraps mklcpu::gemmt in pre/postcondition calls
+cl::sycl::event gemmt<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM gemmt, std::complex<float>: wraps mklcpu::gemmt in pre/postcondition calls
+cl::sycl::event gemmt<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM gemmt, std::complex<double>: wraps mklcpu::gemmt in pre/postcondition calls
+cl::sycl::event gemmt<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM sbmv, float: wraps mklcpu::sbmv in pre/postcondition calls
+cl::sycl::event sbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM sbmv, double: wraps mklcpu::sbmv in pre/postcondition calls
+cl::sycl::event sbmv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklcpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM asum, std::complex<float> input: wraps mklcpu::asum in pre/postcondition calls
+cl::sycl::event asum<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM asum, std::complex<double> input: wraps mklcpu::asum in pre/postcondition calls
+cl::sycl::event asum<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM asum, float: wraps mklcpu::asum in pre/postcondition calls
+cl::sycl::event asum<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM asum, double: wraps mklcpu::asum in pre/postcondition calls
+cl::sycl::event asum<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM tbsv, float: wraps mklcpu::tbsv in pre/postcondition calls
+cl::sycl::event tbsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM tbsv, double: wraps mklcpu::tbsv in pre/postcondition calls
+cl::sycl::event tbsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM tbsv, std::complex<float>: wraps mklcpu::tbsv in pre/postcondition calls
+cl::sycl::event tbsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM tbsv, std::complex<double>: wraps mklcpu::tbsv in pre/postcondition calls
+cl::sycl::event tbsv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM spr2, float: wraps mklcpu::spr2 in pre/postcondition calls
+cl::sycl::event spr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklcpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM spr2, double: wraps mklcpu::spr2 in pre/postcondition calls
+cl::sycl::event spr2<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklcpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM iamax, float: wraps mklcpu::iamax in pre/postcondition calls
+cl::sycl::event iamax<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM iamax, double: wraps mklcpu::iamax in pre/postcondition calls
+cl::sycl::event iamax<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM iamax, std::complex<float>: wraps mklcpu::iamax in pre/postcondition calls
+cl::sycl::event iamax<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM iamax, std::complex<double>: wraps mklcpu::iamax in pre/postcondition calls
+cl::sycl::event iamax<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotm, float: wraps mklcpu::rotm in pre/postcondition calls
+cl::sycl::event rotm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto done = onemkl::mklcpu::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotm, double: wraps mklcpu::rotm in pre/postcondition calls
+cl::sycl::event rotm<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto done = onemkl::mklcpu::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotg, float: wraps mklcpu::rotg in pre/postcondition calls
+cl::sycl::event rotg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotg, double: wraps mklcpu::rotg in pre/postcondition calls
+cl::sycl::event rotg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotg, std::complex<float> (c is float): wraps mklcpu::rotg in pre/post calls
+cl::sycl::event rotg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM rotg, std::complex<double> (c is double): wraps mklcpu::rotg in pre/post calls
+cl::sycl::event rotg<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM sdsdot, float: wraps mklcpu::sdsdot in pre/postcondition calls
+cl::sycl::event sdsdot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies);
+    sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM her2k, std::complex<float> (float beta): wraps mklcpu::her2k in pre/post calls
+cl::sycl::event her2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto done = onemkl::mklcpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM her2k, std::complex<double> (double beta): wraps mklcpu::her2k in pre/post calls
+cl::sycl::event her2k<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto done = onemkl::mklcpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM dot, float: wraps mklcpu::dot in pre/postcondition calls
+cl::sycl::event dot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM dot, double: wraps mklcpu::dot in pre/postcondition calls
+cl::sycl::event dot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM dot, float x/y with double result: wraps mklcpu::dot in pre/post calls
+cl::sycl::event dot<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklcpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM symv, float: wraps mklcpu::symv in pre/postcondition calls
+cl::sycl::event symv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklcpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
+template <>  // USM symv, double: wraps mklcpu::symv in pre/postcondition calls
+cl::sycl::event symv<library::intelmkl, backend::intelcpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklcpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // event returned by the mklcpu call
+}
+
 } //namespace blas
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp b/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
index a64df93f4..b163cb144 100644
--- a/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
+++ b/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
@@ -38,7 +38,7 @@ using onemkl::offset;
 
 namespace mklcpu {
 
-// Level 1
+// Buffer APIs
 
 ONEMKL_EXPORT void asum(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
@@ -246,8 +246,6 @@ ONEMKL_EXPORT void swap(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
                         cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 
-// Level 2
-
 ONEMKL_EXPORT void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
                         std::int64_t kl, std::int64_t ku, float alpha,
                         cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -574,8 +572,6 @@ ONEMKL_EXPORT void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose tran
                         std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
                         std::int64_t incx);
 
-// Level 3
-
 ONEMKL_EXPORT void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
                         std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
                         std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
@@ -753,50 +749,6 @@ ONEMKL_EXPORT void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lowe
                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 
-// Batch API
-
-ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                              cl::sycl::buffer<transpose, 1> &transb,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<std::int64_t, 1> &k,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb,
-                              cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                              cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
 ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
                               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                               cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -831,44 +783,6 @@ ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, transpose transa, transpos
                               cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
                               std::int64_t stride_c, std::int64_t batch_size);
 
-ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                              cl::sycl::buffer<uplo, 1> &upper_lower,
-                              cl::sycl::buffer<transpose, 1> &trans,
-                              cl::sycl::buffer<diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
 ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
                               float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -895,8 +809,6 @@ ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, side left_right, uplo uppe
                               std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 
-// BLAS-like extensions
-
 ONEMKL_EXPORT void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
                          transpose transb, std::int64_t n, std::int64_t k, float alpha,
                          cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -969,6 +881,876 @@ ONEMKL_EXPORT void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose
                             cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
                             cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 
+// USM (Unified Shared Memory) APIs
+// asum (USM): const input x, non-const result pointer; complex x pairs with a real result type.
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy (USM): alpha by value, const x, non-const y; overloads for real/complex, single/double.
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy_batch (USM): n, alpha, incx, incy, group_size are pointers; x/y are pointer-to-pointer.
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// copy (USM): const input x, non-const y; overloads for real and complex, single and double.
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dot (USM): x and y const, result non-const; float inputs also have a double-result overload.
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  float *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                  std::int64_t incx, const double *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotc (USM): complex-only overloads; x and y are const, result is a non-const complex pointer.
+ONEMKL_EXPORT cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotu (USM): complex-only overloads; x and y are const, result is a non-const complex pointer.
+ONEMKL_EXPORT cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamin (USM): const input x in four element types; result is always a std::int64_t pointer.
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamax (USM): const input x in four element types; result is always a std::int64_t pointer.
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// nrm2 (USM): const input x, non-const result pointer; complex x pairs with a real result type.
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rot (USM): x and y are both non-const; c and s are by-value and real even for complex x/y.
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                                  std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                                  float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                                  std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                                  double c, double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                  std::int64_t incx, double *y, std::int64_t incy, double c,
+                                  double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotg (USM): a, b, c, s are all non-const pointers; in the complex overloads c stays real.
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotm (USM): real-only overloads; x, y and param are all non-const pointers.
+ONEMKL_EXPORT cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotmg (USM): real-only; d1, d2, x1, param are non-const pointers while y1 is passed by value.
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// scal (USM): alpha by value, x non-const; complex x accepts either a complex or a real alpha.
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// sdsdot (USM): one float overload; sb by value, x and y const, result non-const.
+ONEMKL_EXPORT cl::sycl::event sdsdot(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// swap (USM): x and y are both non-const; overloads for real and complex, single and double.
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gbmv (USM): trans, m, n, kl, ku, alpha, beta by value; a and x are const, y is non-const.
+ONEMKL_EXPORT cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemv (USM): trans, m, n, alpha, beta by value; a and x are const, y is non-const.
+ONEMKL_EXPORT cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// ger (USM): real-only overloads; x and y are const, a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, const float *y,
+                                  std::int64_t incy, float *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, const double *y,
+                                  std::int64_t incy, double *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gerc (USM): complex-only overloads; x and y are const, a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event gerc(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gerc(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// geru (USM): complex-only overloads; x and y are const, a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event geru(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event geru(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hbmv (USM): complex-only; upper_lower, n, k, alpha, beta by value; a, x const; y non-const.
+ONEMKL_EXPORT cl::sycl::event hbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hemv (USM): complex-only; upper_lower, n, alpha, beta by value; a, x const; y non-const.
+ONEMKL_EXPORT cl::sycl::event hemv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her (USM): real alpha by value; x is const complex; a (with lda) is non-const complex.
+ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a, std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her2 (USM): complex alpha by value; x and y are const; a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event her2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpmv (USM): complex-only; a is const and takes no lda parameter; x const; y non-const.
+ONEMKL_EXPORT cl::sycl::event hpmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr (USM): real alpha by value; x is const complex; a is non-const complex with no lda.
+ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr2 (USM): complex alpha; x and y are const; a is non-const and takes no lda parameter.
+ONEMKL_EXPORT cl::sycl::event hpr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// sbmv (USM): real-only; upper_lower, n, k, alpha, beta by value; a, x const; y non-const.
+ONEMKL_EXPORT cl::sycl::event sbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sbmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spmv (USM): real-only; a is const and takes no lda parameter; x const; y non-const.
+ONEMKL_EXPORT cl::sycl::event spmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spmv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr (USM): real-only; x is const; a is non-const and takes no lda parameter.
+ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr2 (USM): real-only; x and y are const; a is non-const and takes no lda parameter.
+ONEMKL_EXPORT cl::sycl::event spr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symv (USM): real-only; a (with lda) and x are const; y is non-const.
+ONEMKL_EXPORT cl::sycl::event symv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symv(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr (USM): real-only; x is const; a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,
+                                  std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,
+                                  std::int64_t lda,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2 (USM): real-only; x and y are const; a (with lda) is non-const.
+ONEMKL_EXPORT cl::sycl::event syr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbmv (USM): no alpha/beta; a (with lda) is const and x is the only non-const pointer.
+ONEMKL_EXPORT cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbsv (USM): no alpha/beta; a (with lda) is const and x is the only non-const pointer.
+ONEMKL_EXPORT cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // USM (raw-pointer) overload for float; no lda parameter
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // double overload; a is const, x is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // std::complex<float> overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM (raw-pointer) overload for float; no lda parameter
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // double overload; a is const, x is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // std::complex<float> overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(  // USM (raw-pointer) overload for float data
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(  // double overload; a is const, x is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(  // std::complex<float> overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(  // USM (raw-pointer) overload for float data
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(  // double overload; a is const, x is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(  // std::complex<float> overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // USM overload, float; a and b are const, c is non-const
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // double overload
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // std::complex<float> overload; alpha/beta by value
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemm(  // USM overload, std::complex<float>; only c is non-const
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemm(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event herk(  // USM overload: real float alpha/beta, complex a and c
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event herk(  // double alpha/beta with std::complex<double> a and c
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2k(  // USM overload: complex alpha, real (float) beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2k(  // std::complex<double> alpha, double beta
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM overload, float; a and b are const, c is non-const
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // double overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // std::complex<float> overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // USM overload, float; a is const, c is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // double overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // std::complex<float> overload; alpha/beta are complex
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // USM overload, float; a and b are const, c is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // double overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // std::complex<float> overload; alpha/beta are complex
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM overload, float; a is const, b is non-const
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // double overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // std::complex<float> overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // USM overload, float; a is const, b is non-const
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // double overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // std::complex<float> overload
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // float; form with group_count / group_size
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // double; every argument but group_count is a pointer
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // std::complex<float>; a/b are pointer-to-pointer
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // std::complex<double>; dependencies optional
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // float; form with stride_a/b/c and batch_size
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // double; scalars by value, a/b const, c non-const
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // std::complex<float> overload
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(  // std::complex<double>; dependencies optional
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(  // USM overload, float; takes uplo and n, no m parameter
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(  // double overload; a and b are const, c is non-const
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(  // std::complex<float> overload
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(  // std::complex<double> overload; dependencies optional
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
 } //namespace mklcpu
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/detail/mklgpu/blas_ct.hpp b/include/onemkl/blas/detail/mklgpu/blas_ct.hpp
index fdbf50223..32fde5f67 100644
--- a/include/onemkl/blas/detail/mklgpu/blas_ct.hpp
+++ b/include/onemkl/blas/detail/mklgpu/blas_ct.hpp
@@ -33,14 +33,13 @@
 
 #include "onemkl_blas_mklgpu.hpp"
 
+#include "onemkl/blas/detail/blas_ct_templates.hpp"
+
 namespace onemkl {
 namespace blas {
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+// Buffer APIs
+
 template <>
 void syr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -52,11 +51,6 @@ void syr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -68,9 +62,6 @@ void syr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
@@ -79,9 +70,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha, cl::sycl::buffer<double, 1> &x,
@@ -91,9 +79,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<float> alpha,
@@ -104,9 +89,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<double> alpha,
@@ -117,9 +99,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -129,9 +108,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha,
@@ -142,10 +118,6 @@ void scal<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     scal_postcondition(queue, n, alpha, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -156,10 +128,6 @@ void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -170,11 +138,6 @@ void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -187,11 +150,6 @@ void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -204,10 +162,6 @@ void trmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -218,10 +172,6 @@ void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -232,10 +182,6 @@ void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -247,10 +193,6 @@ void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -262,10 +204,6 @@ void tpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a);
 template <>
 void spr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -276,10 +214,6 @@ void spr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a);
 template <>
 void spr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -290,12 +224,6 @@ void spr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     spr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<float> alpha,
@@ -309,12 +237,6 @@ void hpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void hpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<double> alpha,
@@ -328,11 +250,6 @@ void hpmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<float, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -344,11 +261,6 @@ void syrk<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<double, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, std::int64_t n, std::int64_t k,
@@ -360,12 +272,6 @@ void syrk<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -376,12 +282,6 @@ void syrk<library::intelmkl, backend::intelgpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void syrk<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -392,12 +292,6 @@ void syrk<library::intelmkl, backend::intelgpu>(
     syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -409,12 +303,6 @@ void her2<library::intelmkl, backend::intelgpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void her2<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -426,12 +314,6 @@ void her2<library::intelmkl, backend::intelgpu>(
     her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -443,12 +325,6 @@ void hbmv<library::intelmkl, backend::intelgpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
@@ -460,11 +336,6 @@ void hbmv<library::intelmkl, backend::intelgpu>(
     hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -476,11 +347,6 @@ void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
 template <>
 void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -492,10 +358,6 @@ void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c,
-                       float s);
 template <>
 void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -506,10 +368,6 @@ void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       double c, double s);
 template <>
 void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -520,10 +378,6 @@ void rot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     rot_postcondition(queue, n, x, incx, y, incy, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n, float alpha,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -533,10 +387,6 @@ void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 double alpha, cl::sycl::buffer<double, 1> &x,
@@ -547,10 +397,6 @@ void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<float> alpha,
@@ -563,10 +409,6 @@ void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 std::complex<double> alpha,
@@ -579,12 +421,6 @@ void axpy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     axpy_postcondition(queue, n, alpha, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -596,12 +432,6 @@ void gerc<library::intelmkl, backend::intelgpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void gerc<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -613,11 +443,6 @@ void gerc<library::intelmkl, backend::intelgpu>(
     gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                         float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose trans, std::int64_t n, std::int64_t k,
@@ -630,11 +455,6 @@ void syr2k<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo up
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose trans, std::int64_t n, std::int64_t k,
@@ -647,13 +467,6 @@ void syr2k<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo up
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -665,13 +478,6 @@ void syr2k<library::intelmkl, backend::intelgpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void syr2k<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -683,11 +489,6 @@ void syr2k<library::intelmkl, backend::intelgpu>(
     syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, float alpha,
@@ -700,11 +501,6 @@ void gemv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, double alpha,
@@ -717,12 +513,6 @@ void gemv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -734,12 +524,6 @@ void gemv<library::intelmkl, backend::intelgpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void gemv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
@@ -751,10 +535,6 @@ void gemv<library::intelmkl, backend::intelgpu>(
     gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
 template <>
 void her<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -767,10 +547,6 @@ void her<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
 template <>
 void her<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -783,10 +559,6 @@ void her<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -798,10 +570,6 @@ void hpr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -813,9 +581,6 @@ void hpr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -825,9 +590,6 @@ void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -837,10 +599,6 @@ void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -851,10 +609,6 @@ void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamin_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamin(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -864,128 +618,7 @@ void iamin<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     onemkl::mklgpu::iamin(queue, n, x, incx, result);
     iamin_postcondition(queue, n, x, incx, result);
 }
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                              cl::sycl::buffer<transpose, 1> &transb,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<std::int64_t, 1> &k,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb,
-                              cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                              cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void gemm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            group_count, group_size);
-    onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                               group_count, group_size);
-    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             group_count, group_size);
-}
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              cl::sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1001,14 +634,6 @@ void gemm_batch<library::intelmkl, backend::intelgpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                              cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              cl::sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1024,15 +649,6 @@ void gemm_batch<library::intelmkl, backend::intelgpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1049,15 +665,6 @@ void gemm_batch<library::intelmkl, backend::intelgpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
 template <>
 void gemm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1074,11 +681,6 @@ void gemm_batch<library::intelmkl, backend::intelgpu>(
                              stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -1091,11 +693,6 @@ void spmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y,
-                        std::int64_t incy);
 template <>
 void spmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -1108,12 +705,6 @@ void spmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1127,13 +718,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                            float alpha, cl::sycl::buffer<int8_t, 1> &a, std::int64_t lda,
-                            int8_t ao, cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb,
-                            uint8_t bo, float beta, cl::sycl::buffer<int32_t, 1> &c,
-                            std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m,
@@ -1148,12 +732,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(
                            beta, c, ldc, co);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                            cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1167,12 +745,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                            cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                            cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1184,13 +756,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                            std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                            std::int64_t ldb, std::complex<float> beta,
-                            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1202,14 +767,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k,
-                            std::complex<double> alpha,
-                            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                            std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1221,12 +778,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb,
-                            std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
-                            cl::sycl::buffer<half, 1> &a, std::int64_t lda,
-                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                            cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                     transpose transb, std::int64_t m,
@@ -1240,9 +791,6 @@ void gemm_ext<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, tran
     gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1252,9 +800,6 @@ void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1264,10 +809,6 @@ void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1279,10 +820,6 @@ void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void swap(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1294,12 +831,6 @@ void swap<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     swap_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
@@ -1311,12 +842,6 @@ void geru<library::intelmkl, backend::intelgpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
 template <>
 void geru<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
@@ -1328,10 +853,6 @@ void geru<library::intelmkl, backend::intelgpu>(
     geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1342,10 +863,6 @@ void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1356,9 +873,6 @@ void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -1368,9 +882,6 @@ void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -1380,11 +891,6 @@ void nrm2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     nrm2_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1398,12 +904,6 @@ void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1417,13 +917,6 @@ void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1435,13 +928,6 @@ void gemm<library::intelmkl, backend::intelgpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
@@ -1453,11 +939,6 @@ void gemm<library::intelmkl, backend::intelgpu>(
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
-                        cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 template <>
 void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose transa,
                                                 transpose transb, std::int64_t m, std::int64_t n,
@@ -1471,11 +952,6 @@ void gemm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1486,11 +962,6 @@ void herk<library::intelmkl, backend::intelgpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void herk<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -1501,11 +972,6 @@ void herk<library::intelmkl, backend::intelgpu>(
     herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void ger<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t m,
                                                std::int64_t n, float alpha,
@@ -1517,11 +983,6 @@ void ger<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void ger<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t m,
                                                std::int64_t n, double alpha,
@@ -1533,11 +994,6 @@ void ger<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -1552,11 +1008,6 @@ void trsm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -1571,11 +1022,6 @@ void trsm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1590,11 +1036,6 @@ void trsm<library::intelmkl, backend::intelgpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trsm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -1609,11 +1050,6 @@ void trsm<library::intelmkl, backend::intelgpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotu<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1626,11 +1062,6 @@ void dotu<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotu(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotu<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1643,13 +1074,6 @@ void dotu<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     dotu_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1661,13 +1085,6 @@ void hemm<library::intelmkl, backend::intelgpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void hemm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1679,11 +1096,6 @@ void hemm<library::intelmkl, backend::intelgpu>(
     hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<float>, 1> &a);
 template <>
 void hpr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<float> alpha,
@@ -1697,11 +1109,6 @@ void hpr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, cl::sycl::buffer<std::complex<double>, 1> &a);
 template <>
 void hpr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::complex<double> alpha,
@@ -1715,12 +1122,6 @@ void hpr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1734,12 +1135,6 @@ void gbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpose trans,
                                                 std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1753,13 +1148,6 @@ void gbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, transpos
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1771,13 +1159,6 @@ void gbmv<library::intelmkl, backend::intelgpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
 template <>
 void gbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
@@ -1789,10 +1170,6 @@ void gbmv<library::intelmkl, backend::intelgpu>(
     gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -1804,10 +1181,6 @@ void tbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -1819,11 +1192,6 @@ void tbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1834,11 +1202,6 @@ void tbmv<library::intelmkl, backend::intelgpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbmv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -1849,11 +1212,6 @@ void tbmv<library::intelmkl, backend::intelgpu>(
     tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1866,11 +1224,6 @@ void symm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
-                        double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1883,13 +1236,6 @@ void symm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1901,13 +1247,6 @@ void symm<library::intelmkl, backend::intelgpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
 template <>
 void symm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
@@ -1919,11 +1258,6 @@ void symm<library::intelmkl, backend::intelgpu>(
     symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<float>, 1> &result);
 template <>
 void dotc<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -1936,11 +1270,6 @@ void dotc<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dotc(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<std::complex<double>, 1> &result);
 template <>
 void dotc<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -1953,10 +1282,6 @@ void dotc<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     dotc_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<float, 1> &a, std::int64_t lda);
 template <>
 void syr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, float alpha,
@@ -1967,10 +1292,6 @@ void syr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                       cl::sycl::buffer<double, 1> &a, std::int64_t lda);
 template <>
 void syr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                std::int64_t n, double alpha,
@@ -1981,11 +1302,6 @@ void syr<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo uppe
     syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -2000,11 +1316,6 @@ void trmm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side left_right,
                                                 uplo upper_lower, transpose trans, diag unit_diag,
@@ -2019,11 +1330,6 @@ void trmm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, side lef
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2038,11 +1344,6 @@ void trmm<library::intelmkl, backend::intelgpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 template <>
 void trmm<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2057,10 +1358,6 @@ void trmm<library::intelmkl, backend::intelgpu>(
                        ldb);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1,
-                         cl::sycl::buffer<float, 1> &d2, cl::sycl::buffer<float, 1> &x1, float y1,
-                         cl::sycl::buffer<float, 1> &param);
 template <>
 void rotmg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                  cl::sycl::buffer<float, 1> &d1,
@@ -2072,10 +1369,6 @@ void rotmg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1,
-                         cl::sycl::buffer<double, 1> &d2, cl::sycl::buffer<double, 1> &x1,
-                         double y1, cl::sycl::buffer<double, 1> &param);
 template <>
 void rotmg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                  cl::sycl::buffer<double, 1> &d1,
@@ -2087,10 +1380,6 @@ void rotmg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotmg_postcondition(queue, d1, d2, x1, y1, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2101,10 +1390,6 @@ void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2115,10 +1400,6 @@ void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2130,10 +1411,6 @@ void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2145,10 +1422,6 @@ void tpsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2159,10 +1432,6 @@ void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2173,11 +1442,6 @@ void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2190,11 +1454,6 @@ void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
 template <>
 void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2207,9 +1466,6 @@ void trsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2219,9 +1475,6 @@ void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2231,10 +1484,6 @@ void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2246,10 +1495,6 @@ void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void copy(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2261,12 +1506,6 @@ void copy<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     copy_postcondition(queue, n, x, incx, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
@@ -2278,12 +1517,6 @@ void hemv<library::intelmkl, backend::intelgpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 template <>
 void hemv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
@@ -2295,12 +1528,6 @@ void hemv<library::intelmkl, backend::intelgpu>(
     hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                         cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose transa, transpose transb, std::int64_t n,
@@ -2317,12 +1544,6 @@ void gemmt<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo up
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                         cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                  transpose transa, transpose transb, std::int64_t n,
@@ -2339,13 +1560,6 @@ void gemmt<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo up
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2360,13 +1574,6 @@ void gemmt<library::intelmkl, backend::intelgpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
 template <>
 void gemmt<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
@@ -2381,11 +1588,6 @@ void gemmt<library::intelmkl, backend::intelgpu>(
                         ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::int64_t k, float alpha,
@@ -2398,11 +1600,6 @@ void sbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void sbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, std::int64_t k, double alpha,
@@ -2415,10 +1612,6 @@ void sbmv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2429,10 +1622,6 @@ void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2443,9 +1632,6 @@ void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2455,9 +1641,6 @@ void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &result);
 template <>
 void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2467,10 +1650,6 @@ void asum<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     asum_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2482,10 +1661,6 @@ void tbsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 transpose trans, diag unit_diag, std::int64_t n,
@@ -2497,11 +1672,6 @@ void tbsv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2512,11 +1682,6 @@ void tbsv<library::intelmkl, backend::intelgpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 template <>
 void tbsv<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
@@ -2527,11 +1692,6 @@ void tbsv<library::intelmkl, backend::intelgpu>(
     tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &a);
 template <>
 void spr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -2543,11 +1703,6 @@ void spr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &a);
 template <>
 void spr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -2559,9 +1714,6 @@ void spr2<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2571,9 +1723,6 @@ void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                         std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2583,10 +1732,6 @@ void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<float>, 1> &x,
@@ -2597,10 +1742,6 @@ void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void iamax(cl::sycl::queue &queue, std::int64_t n,
-                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         cl::sycl::buffer<std::int64_t, 1> &result);
 template <>
 void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                  cl::sycl::buffer<std::complex<double>, 1> &x,
@@ -2611,118 +1752,6 @@ void iamax<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::in
     iamax_postcondition(queue, n, x, incx, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                              cl::sycl::buffer<uplo, 1> &upper_lower,
-                              cl::sycl::buffer<transpose, 1> &trans,
-                              cl::sycl::buffer<diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-template <>
-void trsm_batch<library::intelmkl, backend::intelgpu>(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, group_count, group_size);
-    onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                               lda, b, ldb, group_count, group_size);
-    trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                             b, ldb, group_count, group_size);
-}
-
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2737,12 +1766,6 @@ void trsm_batch<library::intelmkl, backend::intelgpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2757,13 +1780,6 @@ void trsm_batch<library::intelmkl, backend::intelgpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<float> alpha,
-                              cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2779,13 +1795,6 @@ void trsm_batch<library::intelmkl, backend::intelgpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<double> alpha,
-                              cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 template <>
 void trsm_batch<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
@@ -2801,10 +1810,6 @@ void trsm_batch<library::intelmkl, backend::intelgpu>(
                              stride_a, b, ldb, stride_b, batch_size);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<float, 1> &param);
 template <>
 void rotm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2815,10 +1820,6 @@ void rotm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                        std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                        cl::sycl::buffer<double, 1> &param);
 template <>
 void rotm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                 cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2829,10 +1830,6 @@ void rotm<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int
     rotm_postcondition(queue, n, x, incx, y, incy, param);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a,
-                        cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<float, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<float, 1> &a,
@@ -2844,10 +1841,6 @@ void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a,
-                        cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<double, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<double, 1> &a,
@@ -2859,10 +1852,6 @@ void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
-                        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
-                        cl::sycl::buffer<std::complex<float>, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<std::complex<float>, 1> &a,
@@ -2874,11 +1863,6 @@ void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
-                        cl::sycl::buffer<std::complex<double>, 1> &b,
-                        cl::sycl::buffer<double, 1> &c,
-                        cl::sycl::buffer<std::complex<double>, 1> &s);
 template <>
 void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
                                                 cl::sycl::buffer<std::complex<double>, 1> &a,
@@ -2890,11 +1874,6 @@ void rotg<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue,
     rotg_postcondition(queue, a, b, c, s);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb,
-                          cl::sycl::buffer<float, 1> &x, std::int64_t incx,
-                          cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                          cl::sycl::buffer<float, 1> &result);
 template <>
 void sdsdot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n, float sb,
                                                   cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2905,12 +1884,6 @@ void sdsdot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::i
     sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
 template <>
 void her2k<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2922,13 +1895,6 @@ void her2k<library::intelmkl, backend::intelgpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
 template <>
 void her2k<library::intelmkl, backend::intelgpu>(
     cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
@@ -2940,10 +1906,6 @@ void her2k<library::intelmkl, backend::intelgpu>(
     her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<float, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2954,10 +1916,6 @@ void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<double, 1> &x, std::int64_t incx,
@@ -2968,10 +1926,6 @@ void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x,
-                       std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
-                       cl::sycl::buffer<double, 1> &result);
 template <>
 void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int64_t n,
                                                cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -2982,11 +1936,6 @@ void dot<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, std::int6
     dot_postcondition(queue, n, x, incx, y, incy, result);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        cl::sycl::buffer<float, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        cl::sycl::buffer<float, 1> &y, std::int64_t incy);
 template <>
 void symv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, float alpha,
@@ -2999,11 +1948,6 @@ void symv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-template <onemkl::library lib, onemkl::backend backend>
-static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        cl::sycl::buffer<double, 1> &a, std::int64_t lda,
-                        cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        cl::sycl::buffer<double, 1> &y, std::int64_t incy);
 template <>
 void symv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upper_lower,
                                                 std::int64_t n, double alpha,
@@ -3016,6 +1960,2067 @@ void symv<library::intelmkl, backend::intelgpu>(cl::sycl::queue &queue, uplo upp
     symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
+// USM APIs
+
+template <>  // USM syr2 (float): intelmkl library, intelgpu backend.
+cl::sycl::event syr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklgpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // event from mklgpu::syr2; not waited on before the postcondition call
+}
+
+template <>  // USM syr2 (double): intelmkl library, intelgpu backend.
+cl::sycl::event syr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklgpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // event from mklgpu::syr2; not waited on before the postcondition call
+}
+
+template <>  // USM scal (float): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM scal (double): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM scal (complex<float> alpha and x): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM scal (complex<double> alpha and x): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM scal (float alpha, complex<float> x): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM scal (double alpha, complex<double> x): intelmkl library, intelgpu backend.
+cl::sycl::event scal<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    scal_precondition(queue, n, alpha, x, incx, dependencies);
+    auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies);
+    scal_postcondition(queue, n, alpha, x, incx, dependencies);
+    return done;  // event from mklgpu::scal; not waited on before the postcondition call
+}
+
+template <>  // USM trmv (float): intelmkl library, intelgpu backend.
+cl::sycl::event trmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // event from mklgpu::trmv; not waited on before the postcondition call
+}
+
+template <>  // USM trmv (double): intelmkl library, intelgpu backend.
+cl::sycl::event trmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // event from mklgpu::trmv; not waited on before the postcondition call
+}
+
+template <>  // USM trmv (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event trmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // event from mklgpu::trmv; not waited on before the postcondition call
+}
+
+template <>  // USM trmv (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event trmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                     dependencies);
+    trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return done;  // event from mklgpu::trmv; not waited on before the postcondition call
+}
+
+template <>  // USM tpmv (float): intelmkl library, intelgpu backend.
+cl::sycl::event tpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // event from mklgpu::tpmv; not waited on before the postcondition call
+}
+
+template <>  // USM tpmv (double): intelmkl library, intelgpu backend.
+cl::sycl::event tpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // event from mklgpu::tpmv; not waited on before the postcondition call
+}
+
+template <>  // USM tpmv (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event tpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // event from mklgpu::tpmv; not waited on before the postcondition call
+}
+
+template <>  // USM tpmv (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event tpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto done =
+        onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return done;  // event from mklgpu::tpmv; not waited on before the postcondition call
+}
+
+template <>  // USM spr (float): intelmkl library, intelgpu backend.
+cl::sycl::event spr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto done = onemkl::mklgpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return done;  // event from mklgpu::spr; not waited on before the postcondition call
+}
+
+template <>  // USM spr (double): intelmkl library, intelgpu backend.
+cl::sycl::event spr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    auto done = onemkl::mklgpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    return done;  // event from mklgpu::spr; not waited on before the postcondition call
+}
+
+template <>  // USM hpmv (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event hpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =
+        onemkl::mklgpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // event from mklgpu::hpmv; not waited on before the postcondition call
+}
+
+template <>  // USM hpmv (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event hpmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =
+        onemkl::mklgpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // event from mklgpu::hpmv; not waited on before the postcondition call
+}
+
+template <>  // USM syrk (float): intelmkl library, intelgpu backend.
+cl::sycl::event syrk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // event from mklgpu::syrk; not waited on before the postcondition call
+}
+
+template <>  // USM syrk (double): intelmkl library, intelgpu backend.
+cl::sycl::event syrk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // event from mklgpu::syrk; not waited on before the postcondition call
+}
+
+template <>  // USM syrk (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event syrk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // event from mklgpu::syrk; not waited on before the postcondition call
+}
+
+template <>  // USM syrk (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event syrk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                     dependencies);
+    syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return done;  // event from mklgpu::syrk; not waited on before the postcondition call
+}
+
+template <>  // USM her2 (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event her2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklgpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // event from mklgpu::her2; not waited on before the postcondition call
+}
+
+template <>  // USM her2 (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event her2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto done =
+        onemkl::mklgpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return done;  // event from mklgpu::her2; not waited on before the postcondition call
+}
+
+template <>  // USM hbmv (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event hbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklgpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // event from mklgpu::hbmv; not waited on before the postcondition call
+}
+
+template <>  // USM hbmv (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event hbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklgpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // event from mklgpu::hbmv; not waited on before the postcondition call
+}
+
+template <>  // USM rot (complex<float> x/y, float c/s): intelmkl library, intelgpu backend.
+cl::sycl::event rot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // event from mklgpu::rot; not waited on before the postcondition call
+}
+
+template <>  // USM rot (complex<double> x/y, double c/s): intelmkl library, intelgpu backend.
+cl::sycl::event rot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // event from mklgpu::rot; not waited on before the postcondition call
+}
+
+template <>  // USM rot (float): intelmkl library, intelgpu backend.
+cl::sycl::event rot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float c, float s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // event from mklgpu::rot; not waited on before the postcondition call
+}
+
+template <>  // USM rot (double): intelmkl library, intelgpu backend.
+cl::sycl::event rot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double c, double s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+    rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies);
+    return done;  // event from mklgpu::rot; not waited on before the postcondition call
+}
+
+template <>  // USM axpy (float): intelmkl library, intelgpu backend.
+cl::sycl::event axpy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // event from mklgpu::axpy; not waited on before the postcondition call
+}
+
+template <>  // USM axpy (double): intelmkl library, intelgpu backend.
+cl::sycl::event axpy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // event from mklgpu::axpy; not waited on before the postcondition call
+}
+
+template <>  // USM axpy (complex<float>): intelmkl library, intelgpu backend.
+cl::sycl::event axpy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // event from mklgpu::axpy; not waited on before the postcondition call
+}
+
+template <>  // USM axpy (complex<double>): intelmkl library, intelgpu backend.
+cl::sycl::event axpy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+    axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies);
+    return done;  // event from mklgpu::axpy; not waited on before the postcondition call
+}
+
+template <>  // USM axpy_batch (float, pointer-array arguments): intelmkl library, intelgpu backend.
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // event from mklgpu::axpy_batch; not waited on before the postcondition call
+}
+
+template <>  // USM axpy_batch (double, pointer-array arguments): intelmkl library, intelgpu backend.
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // event from mklgpu::axpy_batch; not waited on before the postcondition call
+}
+
+template <>  // USM axpy_batch (complex<float>, pointer arrays): intelmkl library, intelgpu backend.
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);
+    auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);
+    return done;  // event from mklgpu::axpy_batch; not waited on before the postcondition call
+}
+
+template <>  // axpy_batch (complex<double>, group_count/group_size overload) for intelmkl + intelgpu
+cl::sycl::event axpy_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                            dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                           group_size, dependencies);  // args forwarded unchanged
+    axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size,
+                             dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::axpy_batch returned
+}
+
+template <>  // gerc (complex<float>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gerc<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gerc returned
+}
+
+template <>  // gerc (complex<double>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gerc<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gerc returned
+}
+
+template <>  // syr2k (float) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event syr2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);  // args forwarded unchanged
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::syr2k returned
+}
+
+template <>  // syr2k (double) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event syr2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);  // args forwarded unchanged
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::syr2k returned
+}
+
+template <>  // syr2k (complex<float>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event syr2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);  // args forwarded unchanged
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::syr2k returned
+}
+
+template <>  // syr2k (complex<double>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event syr2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);  // args forwarded unchanged
+    syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::syr2k returned
+}
+
+template <>  // gemv (float) for intelmkl + intelgpu: gemv_precondition, mklgpu call, gemv_postcondition
+cl::sycl::event gemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);  // args forwarded unchanged
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemv returned
+}
+
+template <>  // gemv (double) for intelmkl + intelgpu: gemv_precondition, mklgpu call, gemv_postcondition
+cl::sycl::event gemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);  // args forwarded unchanged
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemv returned
+}
+
+template <>  // gemv (complex<float>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);  // args forwarded unchanged
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemv returned
+}
+
+template <>  // gemv (complex<double>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);  // args forwarded unchanged
+    gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemv returned
+}
+
+template <>  // her (float alpha, complex<float> x/a) for intelmkl + intelgpu
+cl::sycl::event her<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::her returned
+}
+
+template <>  // her (double alpha, complex<double> x/a) for intelmkl + intelgpu
+cl::sycl::event her<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::her returned
+}
+
+template <>  // hpr (float alpha, complex<float> x/a; no lda parameter) for intelmkl + intelgpu
+cl::sycl::event hpr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::hpr returned
+}
+
+template <>  // hpr (double alpha, complex<double> x/a; no lda parameter) for intelmkl + intelgpu
+cl::sycl::event hpr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+    hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::hpr returned
+}
+
+template <>  // iamin (float x, std::int64_t *result) for intelmkl + intelgpu
+cl::sycl::event iamin<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies);  // backend call
+    iamin_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::iamin returned
+}
+
+template <>  // iamin (double x, std::int64_t *result) for intelmkl + intelgpu
+cl::sycl::event iamin<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies);  // backend call
+    iamin_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::iamin returned
+}
+
+template <>  // iamin (complex<float> x, std::int64_t *result) for intelmkl + intelgpu
+cl::sycl::event iamin<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies);  // backend call
+    iamin_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::iamin returned
+}
+
+template <>  // iamin (complex<double> x, std::int64_t *result) for intelmkl + intelgpu
+cl::sycl::event iamin<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamin_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies);  // backend call
+    iamin_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::iamin returned
+}
+template <>  // gemm_batch (float, group_count/group_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b,
+    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                           beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (double, group_count/group_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b,
+    std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+    std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                           beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (complex<float>, group_count/group_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
+    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                           beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (complex<double>, group_count/group_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
+    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
+    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
+    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                            group_count, group_size, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
+                                           beta, c, ldc, group_count, group_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                             group_count, group_size, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (float, stride_a/b/c + batch_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+    const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (double, stride_a/b/c + batch_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+    const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c,
+    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (complex<float>, stride_a/b/c + batch_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // gemm_batch (complex<double>, stride_a/b/c + batch_size overload) for intelmkl + intelgpu
+cl::sycl::event gemm_batch<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                            stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                                   stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
+                             stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
+    return done;  // whatever onemkl::mklgpu::gemm_batch returned
+}
+
+template <>  // spmv (float; no lda parameter) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event spmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::spmv returned
+}
+
+template <>  // spmv (double; no lda parameter) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event spmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    auto done =  // backend call; every argument is forwarded unchanged
+        onemkl::mklgpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+    return done;  // whatever onemkl::mklgpu::spmv returned
+}
+
+template <>  // swap (float; both x and y are non-const pointers) for intelmkl + intelgpu
+cl::sycl::event swap<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies);  // backend call
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::swap returned
+}
+
+template <>  // swap (double; both x and y are non-const pointers) for intelmkl + intelgpu
+cl::sycl::event swap<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies);  // backend call
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::swap returned
+}
+
+template <>  // swap (complex<float>; both x and y are non-const pointers) for intelmkl + intelgpu
+cl::sycl::event swap<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies);  // backend call
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::swap returned
+}
+
+template <>  // swap (complex<double>; both x and y are non-const pointers) for intelmkl + intelgpu
+cl::sycl::event swap<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    swap_precondition(queue, n, x, incx, y, incy, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies);  // backend call
+    swap_postcondition(queue, n, x, incx, y, incy, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::swap returned
+}
+
+template <>  // geru (complex<float>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event geru<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::geru returned
+}
+
+template <>  // geru (complex<double>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event geru<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::geru returned
+}
+
+template <>  // nrm2 (complex<float> x, float *result) for intelmkl + intelgpu
+cl::sycl::event nrm2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies);  // backend call
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::nrm2 returned
+}
+
+template <>  // nrm2 (complex<double> x, double *result) for intelmkl + intelgpu
+cl::sycl::event nrm2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies);  // backend call
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::nrm2 returned
+}
+
+template <>  // nrm2 (float x, float *result) for intelmkl + intelgpu
+cl::sycl::event nrm2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies);  // backend call
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::nrm2 returned
+}
+
+template <>  // nrm2 (double x, double *result) for intelmkl + intelgpu
+cl::sycl::event nrm2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    nrm2_precondition(queue, n, x, incx, result, dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies);  // backend call
+    nrm2_postcondition(queue, n, x, incx, result, dependencies);  // post-call hook
+    return done;  // whatever onemkl::mklgpu::nrm2 returned
+}
+
+template <>  // gemm (float) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                     ldc, dependencies);  // args forwarded unchanged
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::gemm returned
+}
+
+template <>  // gemm (double) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                     ldc, dependencies);  // args forwarded unchanged
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::gemm returned
+}
+
+template <>  // gemm (complex<float>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                     ldc, dependencies);  // args forwarded unchanged
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::gemm returned
+}
+
+template <>  // gemm (complex<double>) for intelmkl + intelgpu: pre-hook, mklgpu call, post-hook
+cl::sycl::event gemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);  // pre-call hook
+    auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+                                     ldc, dependencies);  // args forwarded unchanged
+    gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);  // post-call hook, runs once the call has returned
+    return done;  // whatever onemkl::mklgpu::gemm returned
+}
+
+template <>  // herk (std::complex<float>): wraps onemkl::mklgpu::herk in pre/postcondition calls
+cl::sycl::event herk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    float alpha, const std::complex<float> *a, std::int64_t lda, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklgpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                   dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::herk
+}
+
+template <>  // herk (std::complex<double>): wraps onemkl::mklgpu::herk in pre/postcondition calls
+cl::sycl::event herk<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    auto ev = onemkl::mklgpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                                   dependencies);
+    herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::herk
+}
+
+template <>  // ger (float): wraps onemkl::mklgpu::ger in pre/postcondition calls
+cl::sycl::event ger<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklgpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::ger
+}
+
+template <>  // ger (double): wraps onemkl::mklgpu::ger in pre/postcondition calls
+cl::sycl::event ger<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    auto ev = onemkl::mklgpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::ger
+}
+
+template <>  // trsm (float): wraps onemkl::mklgpu::trsm in pre/postcondition calls
+cl::sycl::event trsm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsm
+}
+
+template <>  // trsm (double): wraps onemkl::mklgpu::trsm in pre/postcondition calls
+cl::sycl::event trsm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsm
+}
+
+template <>  // trsm (std::complex<float>): wraps onemkl::mklgpu::trsm in pre/postcondition calls
+cl::sycl::event trsm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsm
+}
+
+template <>  // trsm (std::complex<double>): wraps onemkl::mklgpu::trsm in pre/postcondition calls
+cl::sycl::event trsm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsm
+}
+
+template <>  // dotu (std::complex<float>): wraps onemkl::mklgpu::dotu in pre/postcondition calls
+cl::sycl::event dotu<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::mklgpu::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::dotu
+}
+
+template <>  // dotu (std::complex<double>): wraps onemkl::mklgpu::dotu in pre/postcondition calls
+cl::sycl::event dotu<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotu_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::mklgpu::dotu(queue, n, x, incx, y, incy, result, dependencies);
+    dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::dotu
+}
+
+template <>  // hemm (std::complex<float>): wraps onemkl::mklgpu::hemm in pre/postcondition calls
+cl::sycl::event hemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::hemm
+}
+
+template <>  // hemm (std::complex<double>): wraps onemkl::mklgpu::hemm in pre/postcondition calls
+cl::sycl::event hemm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::hemm
+}
+
+template <>  // hpr2 (std::complex<float>): wraps onemkl::mklgpu::hpr2 in pre/postcondition calls
+cl::sycl::event hpr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = onemkl::mklgpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                   dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::hpr2
+}
+
+template <>  // hpr2 (std::complex<double>): wraps onemkl::mklgpu::hpr2 in pre/postcondition calls
+cl::sycl::event hpr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto ev = onemkl::mklgpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                   dependencies);
+    hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::hpr2
+}
+
+template <>  // gbmv (float): wraps onemkl::mklgpu::gbmv in pre/postcondition calls
+cl::sycl::event gbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                   incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::gbmv
+}
+
+template <>  // gbmv (double): wraps onemkl::mklgpu::gbmv in pre/postcondition calls
+cl::sycl::event gbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x,
+    std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                   incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::gbmv
+}
+
+template <>  // gbmv (std::complex<float>): wraps onemkl::mklgpu::gbmv in pre/postcondition calls
+cl::sycl::event gbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                   incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::gbmv
+}
+
+template <>  // gbmv (std::complex<double>): wraps onemkl::mklgpu::gbmv in pre/postcondition calls
+cl::sycl::event gbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
+    std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto ev = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
+                                   incy, dependencies);
+    gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::gbmv
+}
+
+template <>  // tbmv (float): wraps onemkl::mklgpu::tbmv in pre/postcondition calls
+cl::sycl::event tbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                   dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tbmv
+}
+
+template <>  // tbmv (double): wraps onemkl::mklgpu::tbmv in pre/postcondition calls
+cl::sycl::event tbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                   dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tbmv
+}
+
+template <>  // tbmv (std::complex<float>): wraps onemkl::mklgpu::tbmv in pre/postcondition calls
+cl::sycl::event tbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                   dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tbmv
+}
+
+template <>  // tbmv (std::complex<double>): wraps onemkl::mklgpu::tbmv in pre/postcondition calls
+cl::sycl::event tbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                   dependencies);
+    tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tbmv
+}
+
+template <>  // symm (float): wraps onemkl::mklgpu::symm in pre/postcondition calls
+cl::sycl::event symm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+    float *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::symm
+}
+
+template <>  // symm (double): wraps onemkl::mklgpu::symm in pre/postcondition calls
+cl::sycl::event symm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+    double *c, std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::symm
+}
+
+template <>  // symm (std::complex<float>): wraps onemkl::mklgpu::symm in pre/postcondition calls
+cl::sycl::event symm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::symm
+}
+
+template <>  // symm (std::complex<double>): wraps onemkl::mklgpu::symm in pre/postcondition calls
+cl::sycl::event symm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                      dependencies);
+    auto ev = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
+                                   beta, c, ldc, dependencies);
+    symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::symm
+}
+
+template <>  // dotc (std::complex<float>): wraps onemkl::mklgpu::dotc in pre/postcondition calls
+cl::sycl::event dotc<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::mklgpu::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::dotc
+}
+
+template <>  // dotc (std::complex<double>): wraps onemkl::mklgpu::dotc in pre/postcondition calls
+cl::sycl::event dotc<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dotc_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto ev = onemkl::mklgpu::dotc(queue, n, x, incx, y, incy, result, dependencies);
+    dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::dotc
+}
+
+template <>  // syr (float): wraps onemkl::mklgpu::syr in pre/postcondition calls
+cl::sycl::event syr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::mklgpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::syr
+}
+
+template <>  // syr (double): wraps onemkl::mklgpu::syr in pre/postcondition calls
+cl::sycl::event syr<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    auto ev = onemkl::mklgpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::syr
+}
+
+template <>  // trmm (float): wraps onemkl::mklgpu::trmm in pre/postcondition calls
+cl::sycl::event trmm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trmm
+}
+
+template <>  // trmm (double): wraps onemkl::mklgpu::trmm in pre/postcondition calls
+cl::sycl::event trmm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+    std::int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trmm
+}
+
+template <>  // trmm (std::complex<float>): wraps onemkl::mklgpu::trmm in pre/postcondition calls
+cl::sycl::event trmm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trmm
+}
+
+template <>  // trmm (std::complex<double>): wraps onemkl::mklgpu::trmm in pre/postcondition calls
+cl::sycl::event trmm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
+    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb,
+                      dependencies);
+    auto ev = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
+                                   a, lda, b, ldb, dependencies);
+    trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+                       ldb, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trmm
+}
+
+template <>  // rotmg (float): wraps onemkl::mklgpu::rotmg in pre/postcondition calls
+cl::sycl::event rotmg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto ev = onemkl::mklgpu::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::rotmg
+}
+
+template <>  // rotmg (double): wraps onemkl::mklgpu::rotmg in pre/postcondition calls
+cl::sycl::event rotmg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies);
+    auto ev = onemkl::mklgpu::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+    rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::rotmg
+}
+
+template <>  // tpsv (float): wraps onemkl::mklgpu::tpsv in pre/postcondition calls
+cl::sycl::event tpsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tpsv
+}
+
+template <>  // tpsv (double): wraps onemkl::mklgpu::tpsv in pre/postcondition calls
+cl::sycl::event tpsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tpsv
+}
+
+template <>  // tpsv (std::complex<float>): wraps onemkl::mklgpu::tpsv in pre/postcondition calls
+cl::sycl::event tpsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tpsv
+}
+
+template <>  // tpsv (std::complex<double>): wraps onemkl::mklgpu::tpsv in pre/postcondition calls
+cl::sycl::event tpsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                   dependencies);
+    tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::tpsv
+}
+
+template <>  // trsv (float): wraps onemkl::mklgpu::trsv in pre/postcondition calls
+cl::sycl::event trsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                   dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsv
+}
+
+template <>  // trsv (double): wraps onemkl::mklgpu::trsv in pre/postcondition calls
+cl::sycl::event trsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                   dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsv
+}
+
+template <>  // trsv (std::complex<float>): wraps onemkl::mklgpu::trsv in pre/postcondition calls
+cl::sycl::event trsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<float> *a, std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                   dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsv
+}
+
+template <>  // trsv (std::complex<double>): wraps onemkl::mklgpu::trsv in pre/postcondition calls
+cl::sycl::event trsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    const std::complex<double> *a, std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    auto ev = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                   dependencies);
+    trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+    return ev;  // event handed back by onemkl::mklgpu::trsv
+}
+
+template <>  // copy (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event copy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::copy; nothing here waits on it.
+}
+
+template <>  // copy (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event copy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::copy; nothing here waits on it.
+}
+
+template <>  // copy (std::complex<float>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event copy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::copy; nothing here waits on it.
+}
+
+template <>  // copy (std::complex<double>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event copy<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    copy_precondition(queue, n, x, incx, y, incy, dependencies);
+    auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies);
+    copy_postcondition(queue, n, x, incx, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::copy; nothing here waits on it.
+}
+
+template <>  // hemv (std::complex<float>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event hemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::hemv; nothing here waits on it.
+}
+
+template <>  // hemv (std::complex<double>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event hemv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::hemv; nothing here waits on it.
+}
+
+template <>  // gemmt (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event gemmt<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // Event from onemkl::mklgpu::gemmt; nothing here waits on it.
+}
+
+template <>  // gemmt (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event gemmt<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // Event from onemkl::mklgpu::gemmt; nothing here waits on it.
+}
+
+template <>  // gemmt (std::complex<float>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event gemmt<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // Event from onemkl::mklgpu::gemmt; nothing here waits on it.
+}
+
+template <>  // gemmt (std::complex<double>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event gemmt<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                       ldc, dependencies);
+    auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
+                                      ldb, beta, c, ldc, dependencies);
+    gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
+                        ldc, dependencies);
+    return done;  // Event from onemkl::mklgpu::gemmt; nothing here waits on it.
+}
+
+template <>  // sbmv (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event sbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklgpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from onemkl::mklgpu::sbmv; nothing here waits on it.
+}
+
+template <>  // sbmv (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event sbmv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                      dependencies);
+    auto done = onemkl::mklgpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                     incy, dependencies);
+    sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                       dependencies);
+    return done;  // Event from onemkl::mklgpu::sbmv; nothing here waits on it.
+}
+
+template <>  // asum (std::complex<float> input, float result): intelmkl/intelgpu specialization.
+cl::sycl::event asum<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::asum; nothing here waits on it.
+}
+
+template <>  // asum (std::complex<double> input, double result): intelmkl/intelgpu specialization.
+cl::sycl::event asum<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::asum; nothing here waits on it.
+}
+
+template <>  // asum (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event asum<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::asum; nothing here waits on it.
+}
+
+template <>  // asum (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event asum<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    asum_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies);
+    asum_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::asum; nothing here waits on it.
+}
+
+template <>  // tbsv (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event tbsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from onemkl::mklgpu::tbsv; nothing here waits on it.
+}
+
+template <>  // tbsv (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event tbsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from onemkl::mklgpu::tbsv; nothing here waits on it.
+}
+
+template <>  // tbsv (std::complex<float>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event tbsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from onemkl::mklgpu::tbsv; nothing here waits on it.
+}
+
+template <>  // tbsv (std::complex<double>): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event tbsv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
+    std::int64_t k, const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
+                                     dependencies);
+    tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+    return done;  // Event from onemkl::mklgpu::tbsv; nothing here waits on it.
+}
+
+template <>  // spr2 (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event spr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklgpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // Event from onemkl::mklgpu::spr2; nothing here waits on it.
+}
+
+template <>  // spr2 (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event spr2<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    auto done =
+        onemkl::mklgpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+    return done;  // Event from onemkl::mklgpu::spr2; nothing here waits on it.
+}
+
+template <>  // iamax (float input, std::int64_t result): intelmkl/intelgpu specialization.
+cl::sycl::event iamax<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::iamax; nothing here waits on it.
+}
+
+template <>  // iamax (double input, std::int64_t result): intelmkl/intelgpu specialization.
+cl::sycl::event iamax<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::iamax; nothing here waits on it.
+}
+
+template <>  // iamax (std::complex<float> input, std::int64_t result): intelmkl/intelgpu.
+cl::sycl::event iamax<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::iamax; nothing here waits on it.
+}
+
+template <>  // iamax (std::complex<double> input, std::int64_t result): intelmkl/intelgpu.
+cl::sycl::event iamax<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    iamax_precondition(queue, n, x, incx, result, dependencies);
+    auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies);
+    iamax_postcondition(queue, n, x, incx, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::iamax; nothing here waits on it.
+}
+
+template <>  // rotm (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event rotm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto done = onemkl::mklgpu::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotm; nothing here waits on it.
+}
+
+template <>  // rotm (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event rotm<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotm_precondition(queue, n, x, incx, y, incy, param, dependencies);
+    auto done = onemkl::mklgpu::rotm(queue, n, x, incx, y, incy, param, dependencies);
+    rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotm; nothing here waits on it.
+}
+
+template <>  // rotg (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event rotg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotg; nothing here waits on it.
+}
+
+template <>  // rotg (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event rotg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotg; nothing here waits on it.
+}
+
+template <>  // rotg (std::complex<float> a/b/s, float c): intelmkl/intelgpu specialization.
+cl::sycl::event rotg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotg; nothing here waits on it.
+}
+
+template <>  // rotg (std::complex<double> a/b/s, double c): intelmkl/intelgpu specialization.
+cl::sycl::event rotg<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    rotg_precondition(queue, a, b, c, s, dependencies);
+    auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies);
+    rotg_postcondition(queue, a, b, c, s, dependencies);
+    return done;  // Event from onemkl::mklgpu::rotg; nothing here waits on it.
+}
+
+template <>  // sdsdot (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event sdsdot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklgpu::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies);
+    sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::sdsdot; nothing here waits on it.
+}
+
+template <>  // her2k (std::complex<float> data, float beta): intelmkl/intelgpu specialization.
+cl::sycl::event her2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto done = onemkl::mklgpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return done;  // Event from onemkl::mklgpu::her2k; nothing here waits on it.
+}
+
+template <>  // her2k (std::complex<double> data, double beta): intelmkl/intelgpu specialization.
+cl::sycl::event her2k<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                       dependencies);
+    auto done = onemkl::mklgpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta,
+                                      c, ldc, dependencies);
+    her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                        dependencies);
+    return done;  // Event from onemkl::mklgpu::her2k; nothing here waits on it.
+}
+
+template <>  // dot (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event dot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::dot; nothing here waits on it.
+}
+
+template <>  // dot (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event dot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::dot; nothing here waits on it.
+}
+
+template <>  // dot (float inputs, double result): intelmkl/intelgpu specialization.
+cl::sycl::event dot<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y,
+    std::int64_t incy, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    dot_precondition(queue, n, x, incx, y, incy, result, dependencies);
+    auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies);
+    dot_postcondition(queue, n, x, incx, y, incy, result, dependencies);
+    return done;  // Event from onemkl::mklgpu::dot; nothing here waits on it.
+}
+
+template <>  // symv (float): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event symv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::symv; nothing here waits on it.
+}
+
+template <>  // symv (double): explicit specialization for intelmkl/intelgpu.
+cl::sycl::event symv<library::intelmkl, backend::intelgpu>(
+    cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    auto done = onemkl::mklgpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                                     dependencies);
+    symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+    return done;  // Event from onemkl::mklgpu::symv; nothing here waits on it.
+}
+
 } //namespace blas
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp b/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
index 67a05eb99..431b0e2dc 100644
--- a/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
+++ b/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
@@ -30,7 +30,7 @@
 namespace onemkl {
 namespace mklgpu {
 
-// Level 3
+// Buffer APIs
 
 ONEMKL_EXPORT void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                         std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
@@ -217,8 +217,6 @@ ONEMKL_EXPORT void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl:
                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
                         cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
 
-// Level 2
-
 ONEMKL_EXPORT void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
                         std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
                         std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
@@ -558,8 +556,6 @@ ONEMKL_EXPORT void trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl
                         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
 
-// Level 1
-
 ONEMKL_EXPORT void dotc(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
                         cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
@@ -766,48 +762,6 @@ ONEMKL_EXPORT void swap(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
                         cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
 
-// Batch API
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
 ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
                               onemkl::transpose transb, std::int64_t m, std::int64_t n,
                               std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
@@ -842,44 +796,6 @@ ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
                               cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
                               std::int64_t stride_c, std::int64_t batch_size);
 
-ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                              cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                              cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                              cl::sycl::buffer<onemkl::diag, 1> &unit_diag,
-                              cl::sycl::buffer<std::int64_t, 1> &m,
-                              cl::sycl::buffer<std::int64_t, 1> &n,
-                              cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                              cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                              cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                              cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
-ONEMKL_EXPORT void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-
 ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right,
                               onemkl::uplo upper_lower, onemkl::transpose trans,
                               onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
@@ -910,8 +826,6 @@ ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right,
                               std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
 
-// BLAS like
-
 ONEMKL_EXPORT void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
                          onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha,
                          cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -986,6 +900,902 @@ ONEMKL_EXPORT void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa,
                             std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb,
                             half beta, cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
 
+// USM APIs: pointer-based overloads that return cl::sycl::event and accept dependency events
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
+    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+    const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symm(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemm(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemm(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, float *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, double *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syrk(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event herk(  // USM overload: std::complex<float> a/c, float alpha/beta
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const std::complex<float> *a, std::int64_t lda, float beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event herk(  // USM overload: std::complex<double> a/c, double alpha/beta
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const std::complex<double> *a, std::int64_t lda, double beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+    float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
+    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+    std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2k(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+    std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2k(  // USM overload: std::complex<float> a/b/c/alpha, float beta
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *b, std::int64_t ldb, float beta, std::complex<float> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2k(  // USM overload: std::complex<double> a/b/c/alpha, double beta
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *b, std::int64_t ldb, double beta, std::complex<double> *c,
+    std::int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, float *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, double *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmm(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, float *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, double *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsm(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+    onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *b, std::int64_t ldb,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x,
+    std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, double alpha, const double *a, std::int64_t lda,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gbmv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+    std::int64_t kl, std::int64_t ku, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, const float *y,
+                                  std::int64_t incy, float *a, std::int64_t lda,  // USM: float
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, const double *y,
+                                  std::int64_t incy, double *a, std::int64_t lda,  // USM: double
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gerc(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gerc(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event geru(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event geru(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hbmv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hbmv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k,
+    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hemv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+    std::int64_t incx, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a, std::int64_t lda,  // USM: real alpha
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a, std::int64_t lda,  // USM: real alpha
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event her2(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpmv(  // USM overload: std::complex<float>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpmv(  // USM overload: std::complex<double>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  float alpha, const std::complex<float> *x, std::int64_t incx,
+                                  std::complex<float> *a,  // USM: real alpha, 'a' has no lda
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  double alpha, const std::complex<double> *x, std::int64_t incx,
+                                  std::complex<double> *a,  // USM: real alpha, 'a' has no lda
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr2(  // USM overload: std::complex<float>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<float> alpha,
+    const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+    std::int64_t incy, std::complex<float> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event hpr2(  // USM overload: std::complex<double>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
+    std::int64_t incy, std::complex<double> *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sbmv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
+    const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sbmv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
+    const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event symv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,
+                                  std::int64_t lda,  // USM overload: float
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,
+                                  std::int64_t lda,  // USM overload: double
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event syr2(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spmv(  // USM overload: float; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *a,
+    const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spmv(  // USM overload: double; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *a,
+    const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  float alpha, const float *x, std::int64_t incx, float *a,  // USM
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                                  double alpha, const double *x, std::int64_t incx, double *a,  // USM
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr2(  // USM overload: float; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *x,
+    std::int64_t incx, const float *y, std::int64_t incy, float *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event spr2(  // USM overload: double; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *x,
+    std::int64_t incx, const double *y, std::int64_t incy, double *a,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
+    float *x, std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
+    double *x, std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbmv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM overload: float
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
+    float *x, std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM overload: double
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
+    double *x, std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM overload: std::complex<float>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
+    std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tbsv(  // USM overload: std::complex<double>
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
+    std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // USM overload: float; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // USM overload: double; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // USM overload: std::complex<float>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpmv(  // USM overload: std::complex<double>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM overload: float; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM overload: double; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM overload: std::complex<float>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event tpsv(  // USM overload: std::complex<double>; 'a' has no lda
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trmv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+    std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event trsv(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+    onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
+    std::complex<double> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotc(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    const std::complex<float> *y, std::int64_t incy, std::complex<float> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dotu(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamax(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event iamin(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::int64_t *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event asum(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
+    float *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
+    double *y, std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
+    std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+    const std::complex<double> *x, std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+    float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
+    double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+    const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event axpy_batch(
+    cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+    const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+    std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event copy(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  float *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                  std::int64_t incx, const double *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event sdsdot(
+    cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
+    const float *y, std::int64_t incy, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                  std::int64_t incx, const float *y, std::int64_t incy,
+                                  double *result,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
+    float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
+    double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event nrm2(
+    cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                                  std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                                  float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                                  std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                                  double c, double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                  std::int64_t incx, double *y, std::int64_t incy, double c,
+                                  double s,
+                                  const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
+    std::complex<float> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotg(
+    cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
+    std::complex<double> *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotm(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event rotmg(
+    cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, std::complex<float> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x, std::int64_t incx,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event scal(
+    cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+    std::int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+    std::int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
+    std::complex<float> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event swap(
+    cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
+    std::complex<double> *y, std::int64_t incy,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
+    const float **b, std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
+    const double **b, std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
+    std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
+    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m,
+    std::int64_t *n, std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
+    std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+    std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+    std::int64_t group_count, std::int64_t *group_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+    std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+    float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+    std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+    double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+    std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
+    std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemm_batch(
+    cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+    std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
+    std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    std::int64_t stride_c, std::int64_t batch_size,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+    onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a,
+    std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+    onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, const double *a,
+    std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+    onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+    const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT cl::sycl::event gemmt(
+    cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+    onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+    const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+    std::int64_t ldb, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
 } //namespace mklgpu
 } //namespace onemkl
 
diff --git a/include/onemkl/blas/predicates.hpp b/include/onemkl/blas/predicates.hpp
index d485bd1c6..b16e8153e 100644
--- a/include/onemkl/blas/predicates.hpp
+++ b/include/onemkl/blas/predicates.hpp
@@ -30,6 +30,8 @@
 namespace onemkl {
 namespace blas {
 
+// Buffer APIs (predicates for the overloads that take cl::sycl::buffer arguments)
+
 inline void herk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
                               std::int64_t n, std::int64_t k, float alpha,
                               cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
@@ -323,118 +325,6 @@ inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int
 #endif
 }
 
-inline void gemm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-    cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-    cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<float>, 1> &alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<float>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<float>, 1> &beta,
-    cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void gemm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
 inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
                                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                                     cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -2996,110 +2886,6 @@ inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::in
 #endif
 }
 
-inline void trsm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-    cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-    cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-    cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-    cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_precondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add prechecks to queue here for input args.  */
-#endif
-}
-
-inline void trsm_batch_postcondition(
-    cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-    cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-    cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-        /* add postchecks to queue here for input args.  */
-#endif
-}
-
 inline void trsm_batch_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
                                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
                                     float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
@@ -3406,6 +3192,3143 @@ inline void rotg_postcondition(cl::sycl::queue &queue, cl::sycl::buffer<std::com
 #endif
 }
 
+// USM APIs
+
+inline void herk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, float alpha,
+                              const std::complex<float> *a, std::int64_t lda, float beta,
+                              std::complex<float> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void herk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, float alpha,
+                               const std::complex<float> *a, std::int64_t lda, float beta,
+                               std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void herk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, double alpha,
+                              const std::complex<double> *a, std::int64_t lda, double beta,
+                              std::complex<double> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void herk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, double alpha,
+                               const std::complex<double> *a, std::int64_t lda, double beta,
+                               std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, real alpha, complex<float>). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, real alpha, complex<float>). */
+#endif
+}
+
+inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, real alpha, complex<double>). */
+#endif
+}
+
+inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, real alpha, complex<double>). */
+#endif
+}
+
+inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+                              float *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+                               float *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+                              double *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+                               double *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                              std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                               std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                              std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                               std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const float *a, float *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const float *a, float *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const double *a, double *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const double *a, double *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<float>). */
+#endif
+}
+
+inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, complex<double>). */
+#endif
+}
+
+inline void spr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                             const float *x, std::int64_t incx, float *a,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *x, std::int64_t incx, float *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, float). */
+#endif
+}
+
+inline void spr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                             const double *x, std::int64_t incx, double *a,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *x, std::int64_t incx, double *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM, double). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                    std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha,
+                                    const float **a, std::int64_t *lda, const float **b,
+                                    std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
+                                    std::int64_t group_count, std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM group API, float). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                     float *alpha, const float **a, std::int64_t *lda,
+                                     const float **b, std::int64_t *ldb, float *beta, float **c,
+                                     std::int64_t *ldc, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM group API, float). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                    std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                    double *alpha, const double **a, std::int64_t *lda,
+                                    const double **b, std::int64_t *ldb, double *beta, double **c,
+                                    std::int64_t *ldc, std::int64_t group_count,
+                                    std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM group API, double). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                     double *alpha, const double **a, std::int64_t *lda,
+                                     const double **b, std::int64_t *ldb, double *beta, double **c,
+                                     std::int64_t *ldc, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM group API, double). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                    std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                    std::complex<float> *alpha, const std::complex<float> **a,
+                                    std::int64_t *lda, const std::complex<float> **b,
+                                    std::int64_t *ldb, std::complex<float> *beta,
+                                    std::complex<float> **c, std::int64_t *ldc,
+                                    std::int64_t group_count, std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM group API, complex<float>). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                     std::complex<float> *alpha, const std::complex<float> **a,
+                                     std::int64_t *lda, const std::complex<float> **b,
+                                     std::int64_t *ldb, std::complex<float> *beta,
+                                     std::complex<float> **c, std::int64_t *ldc,
+                                     std::int64_t group_count, std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM group API, complex<float>). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                    std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                    std::complex<double> *alpha, const std::complex<double> **a,
+                                    std::int64_t *lda, const std::complex<double> **b,
+                                    std::int64_t *ldb, std::complex<double> *beta,
+                                    std::complex<double> **c, std::int64_t *ldc,
+                                    std::int64_t group_count, std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM group API, complex<double>). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                                     std::complex<double> *alpha, const std::complex<double> **a,
+                                     std::int64_t *lda, const std::complex<double> **b,
+                                     std::int64_t *ldb, std::complex<double> *beta,
+                                     std::complex<double> **c, std::int64_t *ldc,
+                                     std::int64_t group_count, std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM group API, complex<double>). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                                    std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                                    const float *a, std::int64_t lda, std::int64_t stride_a,
+                                    const float *b, std::int64_t ldb, std::int64_t stride_b,
+                                    float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
+                                    std::int64_t batch_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM strided API, float). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                                     const float *a, std::int64_t lda, std::int64_t stride_a,
+                                     const float *b, std::int64_t ldb, std::int64_t stride_b,
+                                     float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
+                                     std::int64_t batch_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM strided API, float). */
+#endif
+}
+
+inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                                    std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                                    const double *a, std::int64_t lda, std::int64_t stride_a,
+                                    const double *b, std::int64_t ldb, std::int64_t stride_b,
+                                    double beta, double *c, std::int64_t ldc, std::int64_t stride_c,
+                                    std::int64_t batch_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM strided API, double). */
+#endif
+}
+
+inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                                     const double *a, std::int64_t lda, std::int64_t stride_a,
+                                     const double *b, std::int64_t ldb, std::int64_t stride_b,
+                                     double beta, double *c, std::int64_t ldc,
+                                     std::int64_t stride_c, std::int64_t batch_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM strided API, double). */
+#endif
+}
+
+inline void gemm_batch_precondition(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add prechecks to queue here for input args (USM strided API, complex<float>). */
+#endif
+}
+
+inline void gemm_batch_postcondition(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Stub: add postchecks to queue here for input args (USM strided API, complex<float>). */
+#endif
+}
+
+inline void gemm_batch_precondition(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gemm_batch_postcondition(
+    cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
+    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
+    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+    std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, float alpha, const float *a,
+                              std::int64_t lda, float beta, float *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, float alpha, const float *a,
+                               std::int64_t lda, float beta, float *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, double alpha, const double *a,
+                              std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, double alpha, const double *a,
+                               std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                              const std::complex<float> *a, std::int64_t lda,
+                              std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                              const std::complex<double> *a, std::int64_t lda,
+                              std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void her2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void her2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void her2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void her2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *x,
+                               std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void hbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::int64_t k, std::complex<float> alpha,
+                              const std::complex<float> *a, std::int64_t lda,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void hbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::int64_t k, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void hbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::int64_t k, std::complex<double> alpha,
+                              const std::complex<double> *a, std::int64_t lda,
+                              const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void hbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::int64_t k, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> beta, std::complex<double> *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                             std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c,
+                             float s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                              std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c,
+                              float s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                             std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                             double c, double s,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                              std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                              double c, double s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                             float *y, std::int64_t incy, float c, float s,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                              float *y, std::int64_t incy, float c, float s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                             double *y, std::int64_t incy, double c, double s,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                              double *y, std::int64_t incy, double c, double s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+                              std::int64_t incx, float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+                               std::int64_t incx, float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+                              std::int64_t incx, double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                               const double *x, std::int64_t incx, double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                               const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                              const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, float *alpha,
+                                    const float **x, std::int64_t *incx, float **y,
+                                    std::int64_t *incy, std::int64_t group_count,
+                                    std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, float *alpha,
+                                     const float **x, std::int64_t *incx, float **y,
+                                     std::int64_t *incy, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, double *alpha,
+                                    const double **x, std::int64_t *incx, double **y,
+                                    std::int64_t *incy, std::int64_t group_count,
+                                    std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, double *alpha,
+                                     const double **x, std::int64_t *incx, double **y,
+                                     std::int64_t *incy, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n,
+                                    std::complex<float> *alpha, const std::complex<float> **x,
+                                    std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+                                    std::int64_t group_count, std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n,
+                                     std::complex<float> *alpha, const std::complex<float> **x,
+                                     std::int64_t *incx, std::complex<float> **y,
+                                     std::int64_t *incy, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n,
+                                    std::complex<double> *alpha, const std::complex<double> **x,
+                                    std::int64_t *incx, std::complex<double> **y,
+                                    std::int64_t *incy, std::int64_t group_count,
+                                    std::int64_t *group_size,
+                                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n,
+                                     std::complex<double> *alpha, const std::complex<double> **x,
+                                     std::int64_t *incx, std::complex<double> **y,
+                                     std::int64_t *incy, std::int64_t group_count,
+                                     std::int64_t *group_size,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gerc_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gerc_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gerc_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gerc_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *x,
+                               std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, float alpha, const float *a,
+                               std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+                               float *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, float alpha, const float *a,
+                                std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+                                float *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, double alpha, const double *a,
+                               std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+                               double *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, double alpha, const double *a,
+                                std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+                                double *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *b, std::int64_t ldb,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                                const std::complex<float> *a, std::int64_t lda,
+                                const std::complex<float> *b, std::int64_t ldb,
+                                std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *b, std::int64_t ldb,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                                const std::complex<double> *a, std::int64_t lda,
+                                const std::complex<double> *b, std::int64_t ldb,
+                                std::complex<double> beta, std::complex<double> *c,
+                                std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                              const float *x, std::int64_t incx, float beta, float *y,
+                              std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                               const float *x, std::int64_t incx, float beta, float *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                              const double *x, std::int64_t incx, double beta, double *y,
+                              std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                               const double *x, std::int64_t incx, double beta, double *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::complex<float> alpha,
+                              const std::complex<float> *a, std::int64_t lda,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add postchecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::complex<double> alpha,
+                              const std::complex<double> *a, std::int64_t lda,
+                              const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op for now. TODO: add prechecks to queue here for the input args. */
+#endif
+}
+
+inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> beta, std::complex<double> *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemv (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void her_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                             const std::complex<float> *x, std::int64_t incx,
+                             std::complex<float> *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add her (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void her_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add her (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void her_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                             const std::complex<double> *x, std::int64_t incx,
+                             std::complex<double> *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add her (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void her_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add her (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void hpr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                             const std::complex<float> *x, std::int64_t incx,
+                             std::complex<float> *a,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpr (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void hpr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpr (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void hpr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                             const std::complex<double> *x, std::int64_t incx,
+                             std::complex<double> *a,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpr (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void hpr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpr (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (float) prechecks for the input args here. */
+#endif
+}
+
+inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                std::int64_t incx, std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (float) postchecks for the input args here. */
+#endif
+}
+
+inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (double) prechecks for the input args here. */
+#endif
+}
+
+inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                std::int64_t incx, std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (double) postchecks for the input args here. */
+#endif
+}
+
+inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                                const std::complex<float> *x, std::int64_t incx,
+                                std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                                const std::complex<double> *x, std::int64_t incx,
+                                std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add iamin (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void hpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpmv (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void hpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpmv (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void hpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpmv (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void hpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> beta, std::complex<double> *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add hpmv (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void spmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *a, const float *x, std::int64_t incx, float beta,
+                              float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add spmv (float) prechecks for the input args here. */
+#endif
+}
+
+inline void spmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               float alpha, const float *a, const float *x, std::int64_t incx,
+                               float beta, float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add spmv (float) postchecks for the input args here. */
+#endif
+}
+
+inline void spmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *a, const double *x, std::int64_t incx,
+                              double beta, double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add spmv (double) prechecks for the input args here. */
+#endif
+}
+
+inline void spmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               double alpha, const double *a, const double *x, std::int64_t incx,
+                               double beta, double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add spmv (double) postchecks for the input args here. */
+#endif
+}
+
+inline void rotmg_precondition(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                               float *param,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add rotmg (float) prechecks for the input args here. */
+#endif
+}
+
+inline void rotmg_postcondition(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                                float *param,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add rotmg (float) postchecks for the input args here. */
+#endif
+}
+
+inline void rotmg_precondition(cl::sycl::queue &queue, double *d1, double *d2, double *x1,
+                               double y1, double *param,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add rotmg (double) prechecks for the input args here. */
+#endif
+}
+
+inline void rotmg_postcondition(cl::sycl::queue &queue, double *d1, double *d2, double *x1,
+                                double y1, double *param,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add rotmg (double) postchecks for the input args here. */
+#endif
+}
+
+inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                              float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (float) prechecks for the input args here. */
+#endif
+}
+
+inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                               float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (float) postchecks for the input args here. */
+#endif
+}
+
+inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                              double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (double) prechecks for the input args here. */
+#endif
+}
+
+inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                               double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (double) postchecks for the input args here. */
+#endif
+}
+
+inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                              std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                               std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                              std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                               std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add swap (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void geru_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add geru (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void geru_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add geru (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void geru_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add geru (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void geru_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *x,
+                               std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add geru (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                              std::int64_t incx, float *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, float *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                              std::int64_t incx, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx, double *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                              std::int64_t incx, float *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (float) prechecks for the input args here. */
+#endif
+}
+
+inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                               std::int64_t incx, float *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (float) postchecks for the input args here. */
+#endif
+}
+
+inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                              std::int64_t incx, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (double) prechecks for the input args here. */
+#endif
+}
+
+inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                               std::int64_t incx, double *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add nrm2 (double) postchecks for the input args here. */
+#endif
+}
+
+inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                               transpose transb, std::int64_t n, std::int64_t k, float alpha,
+                               const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                               float beta, float *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (float) prechecks for the input args here. */
+#endif
+}
+
+inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                                transpose transb, std::int64_t n, std::int64_t k, float alpha,
+                                const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                                float beta, float *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (float) postchecks for the input args here. */
+#endif
+}
+
+inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                               transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                               const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                               double beta, double *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (double) prechecks for the input args here. */
+#endif
+}
+
+inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                                transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                                const double *a, std::int64_t lda, const double *b,
+                                std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (double) postchecks for the input args here. */
+#endif
+}
+
+inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                               transpose transb, std::int64_t n, std::int64_t k,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                                transpose transb, std::int64_t n, std::int64_t k,
+                                std::complex<float> alpha, const std::complex<float> *a,
+                                std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                                std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                               transpose transb, std::int64_t n, std::int64_t k,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                                transpose transb, std::int64_t n, std::int64_t k,
+                                std::complex<double> alpha, const std::complex<double> *a,
+                                std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                                std::complex<double> beta, std::complex<double> *c,
+                                std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemmt (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                              const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                              float beta, float *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (float) prechecks for the input args here. */
+#endif
+}
+
+inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                               const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                               float beta, float *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (float) postchecks for the input args here. */
+#endif
+}
+
+inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                              const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                              double beta, double *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (double) prechecks for the input args here. */
+#endif
+}
+
+inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                               std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                               const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                               double beta, double *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (double) postchecks for the input args here. */
+#endif
+}
+
+inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                              std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (complex<float>) prechecks for the input args here. */
+#endif
+}
+
+inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                               std::int64_t m, std::int64_t n, std::int64_t k,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (complex<float>) postchecks for the input args here. */
+#endif
+}
+
+inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                              std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (complex<double>) prechecks for the input args here. */
+#endif
+}
+
+inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                               std::int64_t m, std::int64_t n, std::int64_t k,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (complex<double>) postchecks for the input args here. */
+#endif
+}
+
+inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                              std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
+                              const half *a, std::int64_t lda, const half *b, std::int64_t ldb,
+                              half beta, half *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* Placeholder, currently a no-op: add gemm (half) prechecks for the input args here. */
+#endif
+}
+
+inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb,
+                               std::int64_t m, std::int64_t n, std::int64_t k, half alpha,
+                               const half *a, std::int64_t lda, const half *b, std::int64_t ldb,
+                               half beta, half *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void syr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *x, std::int64_t incx, const float *y, std::int64_t incy,
+                              float *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void syr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               float alpha, const float *x, std::int64_t incx, const float *y,
+                               std::int64_t incy, float *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void syr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *x, std::int64_t incx, const double *y,
+                              std::int64_t incy, double *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void syr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               double alpha, const double *x, std::int64_t incx, const double *y,
+                               std::int64_t incy, double *a, std::int64_t lda,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void ger_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                             const float *x, std::int64_t incx, const float *y, std::int64_t incy,
+                             float *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void ger_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                              const float *x, std::int64_t incx, const float *y, std::int64_t incy,
+                              float *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void ger_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                             const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                             double *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void ger_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                              const double *x, std::int64_t incx, const double *y,
+                              std::int64_t incy, double *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              float alpha, const float *a, std::int64_t lda, float *b,
+                              std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               float alpha, const float *a, std::int64_t lda, float *b,
+                               std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              double alpha, const double *a, std::int64_t lda, double *b,
+                              std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               double alpha, const double *a, std::int64_t lda, double *b,
+                               std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void dotu_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void dotu_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void dotu_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void dotu_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx,
+                               const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void hemm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                              const std::complex<float> *a, std::int64_t lda,
+                              const std::complex<float> *b, std::int64_t ldb,
+                              std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void hemm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *b, std::int64_t ldb,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void hemm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                              const std::complex<double> *a, std::int64_t lda,
+                              const std::complex<double> *b, std::int64_t ldb,
+                              std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void hemm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *b, std::int64_t ldb,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void hpr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void hpr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *a,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void hpr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void hpr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *x,
+                               std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *a,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
+                              const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                              float beta, float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
+                               const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                               float beta, float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                              const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                              double beta, double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                               const double *a, std::int64_t lda, const double *x,
+                               std::int64_t incx, double beta, double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::int64_t kl, std::int64_t ku,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::int64_t kl, std::int64_t ku,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                              std::int64_t n, std::int64_t kl, std::int64_t ku,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                               std::int64_t n, std::int64_t kl, std::int64_t ku,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> beta, std::complex<double> *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                              std::int64_t lda, float *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                               std::int64_t lda, float *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                              std::int64_t lda, double *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                               std::int64_t lda, double *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k,
+                              const std::complex<float> *a, std::int64_t lda,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k,
+                               const std::complex<float> *a, std::int64_t lda,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k,
+                              const std::complex<double> *a, std::int64_t lda,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k,
+                               const std::complex<double> *a, std::int64_t lda,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, float alpha, const float *a,
+                              std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+                              float *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, float alpha, const float *a,
+                               std::int64_t lda, const float *b, std::int64_t ldb, float beta,
+                               float *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, double alpha, const double *a,
+                              std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+                              double *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, double alpha, const double *a,
+                               std::int64_t lda, const double *b, std::int64_t ldb, double beta,
+                               double *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                              const std::complex<float> *a, std::int64_t lda,
+                              const std::complex<float> *b, std::int64_t ldb,
+                              std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *b, std::int64_t ldb,
+                               std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                              const std::complex<double> *a, std::int64_t lda,
+                              const std::complex<double> *b, std::int64_t ldb,
+                              std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *b, std::int64_t ldb,
+                               std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void dotc_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                              std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                              std::complex<float> *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void dotc_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                               std::complex<float> *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void dotc_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                              std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                              std::complex<double> *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void dotc_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx,
+                               const std::complex<double> *y, std::int64_t incy,
+                               std::complex<double> *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void syr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                             const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void syr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add postchecks to queue here for input args. */
+#endif
+}
+
+inline void syr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
+                             const double *x, std::int64_t incx, double *a, std::int64_t lda,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub (nothing checked yet): add prechecks to queue here for input args. */
+#endif
+}
+
+inline void syr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *x, std::int64_t incx, double *a,
+                              std::int64_t lda,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add syr postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              float alpha, const float *a, std::int64_t lda, float *b,
+                              std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               float alpha, const float *a, std::int64_t lda, float *b,
+                               std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              double alpha, const double *a, std::int64_t lda, double *b,
+                              std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               double alpha, const double *a, std::int64_t lda, double *b,
+                               std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                               transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trmm postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void symv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                              float beta, float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add symv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void symv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               float alpha, const float *a, std::int64_t lda, const float *x,
+                               std::int64_t incx, float beta, float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add symv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void symv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *a, std::int64_t lda, const double *x,
+                              std::int64_t incx, double beta, double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add symv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void symv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               double alpha, const double *a, std::int64_t lda, const double *x,
+                               std::int64_t incx, double beta, double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add symv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const float *a, float *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const float *a, float *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const double *a, double *x,
+                              std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const double *a, double *x,
+                               std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tpsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+                              float *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+                               float *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+                              double *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+                               double *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                              std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                               std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                              std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                               std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add trsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                              std::int64_t incx, float *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                               std::int64_t incx, float *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                              std::int64_t incx, double *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                               std::int64_t incx, double *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                              std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                              std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add copy postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void hemv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<float> alpha, const std::complex<float> *a,
+                              std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                              std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add hemv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void hemv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<float> alpha, const std::complex<float> *a,
+                               std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                               std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add hemv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void hemv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::complex<double> alpha, const std::complex<double> *a,
+                              std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                              std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add hemv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void hemv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::complex<double> alpha, const std::complex<double> *a,
+                               std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                               std::complex<double> beta, std::complex<double> *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add hemv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                std::int64_t incx, std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                std::int64_t incx, std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                                const std::complex<float> *x, std::int64_t incx,
+                                std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx,
+                               std::int64_t *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                                const std::complex<double> *x, std::int64_t incx,
+                                std::int64_t *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add iamax postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void sbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                              const float *x, std::int64_t incx, float beta, float *y,
+                              std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add sbmv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void sbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                               const float *x, std::int64_t incx, float beta, float *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add sbmv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void sbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              std::int64_t k, double alpha, const double *a, std::int64_t lda,
+                              const double *x, std::int64_t incx, double beta, double *y,
+                              std::int64_t incy,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add sbmv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void sbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               std::int64_t k, double alpha, const double *a, std::int64_t lda,
+                               const double *x, std::int64_t incx, double beta, double *y,
+                               std::int64_t incy,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add sbmv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                              std::int64_t incx, float *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                               std::int64_t incx, float *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                              std::int64_t incx, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n,
+                               const std::complex<double> *x, std::int64_t incx, double *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                              std::int64_t incx, float *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                               std::int64_t incx, float *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                              std::int64_t incx, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                               std::int64_t incx, double *result,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add asum postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                              std::int64_t lda, float *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                               std::int64_t lda, float *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                              std::int64_t lda, double *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                               std::int64_t lda, double *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k,
+                              const std::complex<float> *a, std::int64_t lda,
+                              std::complex<float> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k,
+                               const std::complex<float> *a, std::int64_t lda,
+                               std::complex<float> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                              diag unit_diag, std::int64_t n, std::int64_t k,
+                              const std::complex<double> *a, std::int64_t lda,
+                              std::complex<double> *x, std::int64_t incx,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               diag unit_diag, std::int64_t n, std::int64_t k,
+                               const std::complex<double> *a, std::int64_t lda,
+                               std::complex<double> *x, std::int64_t incx,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add tbsv postchecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void spr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
+                              const float *x, std::int64_t incx, const float *y, std::int64_t incy,
+                              float *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* TODO: stub, currently a no-op; add spr2 prechecks to queue here for input args. */
+#endif // !ONEMKL_DISABLE_PREDICATES
+}
+
+inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               float alpha, const float *x, std::int64_t incx, const float *y,
+                               std::int64_t incy, float *a,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add spr2 (float) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void spr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                              double alpha, const double *x, std::int64_t incx, const double *y,
+                              std::int64_t incy, double *a,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add spr2 (double) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                               double alpha, const double *x, std::int64_t incx, const double *y,
+                               std::int64_t incy, double *a,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add spr2 (double) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotm_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                              float *y, std::int64_t incy, float *param,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotm (float) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotm_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx,
+                               float *y, std::int64_t incy, float *param,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotm (float) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotm_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                              double *y, std::int64_t incy, double *param,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotm (double) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotm_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                               double *y, std::int64_t incy, double *param,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotm (double) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                             std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (float) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                              std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (float) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                             std::int64_t incx, const double *y, std::int64_t incy, double *result,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (double) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                              std::int64_t incx, const double *y, std::int64_t incy, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (double) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                             std::int64_t incx, const float *y, std::int64_t incy, double *result,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (float in, double out) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                              std::int64_t incx, const float *y, std::int64_t incy, double *result,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add dot (float in, double out) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void sdsdot_precondition(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                                std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add sdsdot prechecks to queue here for input args.  */
+#endif
+}
+
+inline void sdsdot_postcondition(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                                 std::int64_t incx, const float *y, std::int64_t incy,
+                                 float *result,
+                                 const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add sdsdot postchecks to queue here for input args.  */
+#endif
+}
+
+inline void her2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                               const std::complex<float> *a, std::int64_t lda,
+                               const std::complex<float> *b, std::int64_t ldb, float beta,
+                               std::complex<float> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add her2k (complex<float>) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void her2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                                const std::complex<float> *a, std::int64_t lda,
+                                const std::complex<float> *b, std::int64_t ldb, float beta,
+                                std::complex<float> *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add her2k (complex<float>) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void her2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                               std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                               const std::complex<double> *a, std::int64_t lda,
+                               const std::complex<double> *b, std::int64_t ldb, double beta,
+                               std::complex<double> *c, std::int64_t ldc,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add her2k (complex<double>) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void her2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                                std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                                const std::complex<double> *a, std::int64_t lda,
+                                const std::complex<double> *b, std::int64_t ldb, double beta,
+                                std::complex<double> *c, std::int64_t ldc,
+                                const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add her2k (complex<double>) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_precondition(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (float) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_postcondition(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (float) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_precondition(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (double) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_postcondition(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (double) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_precondition(cl::sycl::queue &queue, std::complex<float> *a,
+                              std::complex<float> *b, float *c, std::complex<float> *s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (complex<float>) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_postcondition(cl::sycl::queue &queue, std::complex<float> *a,
+                               std::complex<float> *b, float *c, std::complex<float> *s,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (complex<float>) postchecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_precondition(cl::sycl::queue &queue, std::complex<double> *a,
+                              std::complex<double> *b, double *c, std::complex<double> *s,
+                              const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (complex<double>) prechecks to queue here for input args.  */
+#endif
+}
+
+inline void rotg_postcondition(cl::sycl::queue &queue, std::complex<double> *a,
+                               std::complex<double> *b, double *c, std::complex<double> *s,
+                               const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+#ifndef ONEMKL_DISABLE_PREDICATES
+        /* No-op stub: add rotg (complex<double>) postchecks to queue here for input args.  */
+#endif
+}
+
 } //namespace blas
 } //namespace onemkl
 
diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp
index e04b86b62..39a710048 100644
--- a/src/blas/backends/cublas/cublas_batch.cpp
+++ b/src/blas/backends/cublas/cublas_batch.cpp
@@ -16,64 +16,21 @@
 *  limitations under the License.
 *
 **************************************************************************/
-#include <stdexcept>
+#include "cublas_helper.hpp"
+#include "include/exceptions_helper.hpp"
 #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
 
 namespace onemkl {
 namespace cublas {
 
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-                cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-                cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
+// Buffer APIs
 
 void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
                 std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
                 std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer<float, 1> &b,
                 std::int64_t ldb, std::int64_t stride_b, float beta, cl::sycl::buffer<float, 1> &c,
                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: always throws, not implemented for cuBLAS
 }
 
 void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
@@ -82,7 +39,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std:
                 std::int64_t ldb, std::int64_t stride_b, double beta,
                 cl::sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
                 std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
@@ -92,7 +49,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std:
                 std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
                 cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
                 std::int64_t stride_c, std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
@@ -102,51 +59,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std:
                 std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
                 cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
                 std::int64_t stride_c, std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<double>, 1> &alpha,
-                cl::sycl::buffer<std::complex<double>, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<double>, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
@@ -154,7 +67,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
                 std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
@@ -162,7 +75,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 cl::sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
                 std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
@@ -170,7 +83,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
                 std::int64_t stride_a, cl::sycl::buffer<std::complex<float>, 1> &b,
                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
@@ -178,7 +91,111 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
                 std::int64_t stride_a, cl::sycl::buffer<std::complex<double>, 1> &b,
                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
+}
+
+// USM APIs
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda,
+                           const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
+                           int64_t group_count, int64_t *groupsize,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda,
+                           const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc,
+                           int64_t group_count, int64_t *groupsize,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, std::complex<float> *alpha,
+                           const std::complex<float> **a, int64_t *lda,
+                           const std::complex<float> **b, int64_t *ldb, std::complex<float> *beta,
+                           std::complex<float> **c, int64_t *ldc, int64_t group_count,
+                           int64_t *groupsize,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, std::complex<double> *alpha,
+                           const std::complex<double> **a, int64_t *lda,
+                           const std::complex<double> **b, int64_t *ldb, std::complex<double> *beta,
+                           std::complex<double> **c, int64_t *ldc, int64_t group_count,
+                           int64_t *groupsize,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                           int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b,
+                           float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM strided stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                           int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b,
+                           double beta, double *c, int64_t ldc, int64_t stride_c,
+                           int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM strided stub: always throws, no event returned
 }
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, int64_t stride_a,
+                           const std::complex<float> *b, int64_t ldb, int64_t stride_b,
+                           std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                           int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM strided stub: always throws, no event returned
+}
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda, int64_t stride_a,
+                           const std::complex<double> *b, int64_t ldb, int64_t stride_b,
+                           std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                           int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM strided stub: always throws, no event returned
+}
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, float *alpha, const float **x,
+                           int64_t *incx, float **y, int64_t *incy, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, double *alpha, const double **x,
+                           int64_t *incx, double **y, int64_t *incy, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
+                           const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
+                           int64_t *incy, int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
+                           const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
+                           int64_t *incy, int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM group stub: always throws, no event returned
+}
+
 } // namespace cublas
 } // namespace onemkl
diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp
index 6d17b34e5..b75ce5e5c 100644
--- a/src/blas/backends/cublas/cublas_extensions.cpp
+++ b/src/blas/backends/cublas/cublas_extensions.cpp
@@ -16,26 +16,29 @@
 *  limitations under the License.
 *
 **************************************************************************/
-#include <stdexcept>
+#include "cublas_helper.hpp"
+#include "include/exceptions_helper.hpp"
 #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
 
 namespace onemkl {
 namespace cublas {
 
+// Buffer APIs
+
 // BLAS-like extensions
 
 void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
            std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
            std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
            cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: always throws, not implemented for cuBLAS
 }
 
 void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
            std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
            std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
            cl::sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: always throws, not implemented for cuBLAS
 }
 
 void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
@@ -43,7 +46,7 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose
            cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
@@ -52,14 +55,14 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose
            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
            std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
               std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<half, 1> &a,
               std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, float beta,
               cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: always throws, not implemented for cuBLAS
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
@@ -67,21 +70,21 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, offset
               cl::sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
               cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
               cl::sycl::buffer<int32_t, 1> &c, std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
               std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
               std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
               cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: float gemm_ext always throws
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
               std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
               std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
               cl::sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: double gemm_ext always throws
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
@@ -90,7 +93,7 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i
               cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
               std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
               std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
@@ -99,14 +102,48 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i
               cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
               std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
               std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception();
 }
 
 void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
               std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
               std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
               cl::sycl::buffer<half, 1> &c, std::int64_t ldc) {
-    throw std::runtime_error("Not implemented for cublas");
+    throw backend_unsupported_exception(); // stub: half gemm_ext always throws
+}
+
+// USM (Unified Shared Memory) APIs: pointer-based, each returns a cl::sycl::event
+
+// BLAS-like extensions
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                      const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM gemmt (float) always throws
+}
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                      const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM gemmt (double) always throws
+}
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      int64_t lda, const std::complex<float> *b, int64_t ldb,
+                      std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM gemmt (complex<float>) always throws
+}
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
+                      int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM gemmt (complex<double>) always throws
 }
 
 } // namespace cublas
diff --git a/src/blas/backends/cublas/cublas_helper.hpp b/src/blas/backends/cublas/cublas_helper.hpp
index 1304acdba..2ae92c56a 100644
--- a/src/blas/backends/cublas/cublas_helper.hpp
+++ b/src/blas/backends/cublas/cublas_helper.hpp
@@ -26,8 +26,8 @@
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <complex>
-#include <stdexcept>
 #include "onemkl/types.hpp"
+
 namespace onemkl {
 namespace cublas {
 
diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp
index e8bf9f937..866624f4f 100644
--- a/src/blas/backends/cublas/cublas_level1.cpp
+++ b/src/blas/backends/cublas/cublas_level1.cpp
@@ -18,12 +18,16 @@
 **************************************************************************/
 #include "cublas_helper.hpp"
 #include "cublas_scope_handle.hpp"
+#include "include/exceptions_helper.hpp"
 #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
 
 #include <CL/sycl/detail/pi.hpp>
 
 namespace onemkl {
 namespace cublas {
+
+// Buffer APIs (operands passed as cl::sycl::buffer)
+
 // Level 1
 template <typename Func, typename T1, typename T2>
 inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<T1, 1> &x,
@@ -602,5 +606,269 @@ NRM2_LAUNCHER(std::complex<float>, float, cublasScnrm2)
 NRM2_LAUNCHER(std::complex<double>, double, cublasDznrm2)
 #undef NRM2_LAUNCHER
 
+// USM (Unified Shared Memory) APIs: pointer-based, each returns a cl::sycl::event
+
+// Level 1
+template <typename Func, typename T1, typename T2> // helper for the USM asum overloads
+inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const T1 *x,
+                            const int64_t incx, T2 *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
+    cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
+                         TYPE2 *result,                                                         \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {         \
+        return asum(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies);                   \
+    }
+ASUM_LAUNCHER_USM(float, float, cublasSasum)
+ASUM_LAUNCHER_USM(double, double, cublasDasum)
+ASUM_LAUNCHER_USM(std::complex<float>, float, cublasScasum)
+ASUM_LAUNCHER_USM(std::complex<double>, double, cublasDzasum)
+#undef ASUM_LAUNCHER_USM
+
+template <typename Func, typename T1, typename T2> // helper for the USM scal overloads
+inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                      \
+    cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {      \
+        return scal(CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies);                     \
+    }
+SCAL_LAUNCHER_USM(float, float, cublasSscal)
+SCAL_LAUNCHER_USM(double, double, cublasDscal)
+SCAL_LAUNCHER_USM(std::complex<float>, std::complex<float>, cublasCscal)
+SCAL_LAUNCHER_USM(std::complex<double>, std::complex<double>, cublasZscal)
+SCAL_LAUNCHER_USM(float, std::complex<float>, cublasCsscal)
+SCAL_LAUNCHER_USM(double, std::complex<double>, cublasZdscal)
+#undef SCAL_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM axpy overloads
+inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, const T *x,
+                            int64_t incx, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                         \
+    cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x,  \
+                         int64_t incx, TYPE *y, int64_t incy,                           \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return axpy(CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies);   \
+    }
+
+AXPY_LAUNCHER_USM(float, cublasSaxpy)
+AXPY_LAUNCHER_USM(double, cublasDaxpy)
+AXPY_LAUNCHER_USM(std::complex<float>, cublasCaxpy)
+AXPY_LAUNCHER_USM(std::complex<double>, cublasZaxpy)
+#undef AXPY_LAUNCHER_USM
+
+template <typename Func, typename T1, typename T2> // helper for the USM rotg overloads
+inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                  \
+    cl::sycl::event rotg(cl::sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {  \
+        return rotg(CUBLAS_ROUTINE, queue, a, b, c, s, dependencies);                    \
+    }
+
+ROTG_LAUNCHER_USM(float, float, cublasSrotg)
+ROTG_LAUNCHER_USM(double, double, cublasDrotg)
+ROTG_LAUNCHER_USM(std::complex<float>, float, cublasCrotg)
+ROTG_LAUNCHER_USM(std::complex<double>, double, cublasZrotg)
+#undef ROTG_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM rotm overloads
+inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
+                            int64_t incy, T *param,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, \
+                         int64_t incy, TYPE *param,                                         \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return rotm(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies);       \
+    }
+
+ROTM_LAUNCHER_USM(float, cublasSrotm)
+ROTM_LAUNCHER_USM(double, cublasDrotm)
+#undef ROTM_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM copy overloads
+inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const T *x, int64_t incx,
+                            T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define COPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \
+                         int64_t incy,                                                            \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return copy(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);                    \
+    }
+
+COPY_LAUNCHER_USM(float, cublasScopy)
+COPY_LAUNCHER_USM(double, cublasDcopy)
+COPY_LAUNCHER_USM(std::complex<float>, cublasCcopy)
+COPY_LAUNCHER_USM(std::complex<double>, cublasZcopy)
+#undef COPY_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM dot/dotc/dotu launchers
+inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T *x,
+                           const int64_t incx, const T *y, int64_t incy, T *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event dot##EXT(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
+                             const TYPE *y, const int64_t incy, TYPE *result,                      \
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return dot(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies);              \
+    }
+DOT_LAUNCHER_USM(, float, cublasSdot)
+DOT_LAUNCHER_USM(, double, cublasDdot)
+DOT_LAUNCHER_USM(c, std::complex<float>, cublasCdotc)
+DOT_LAUNCHER_USM(c, std::complex<double>, cublasZdotc)
+DOT_LAUNCHER_USM(u, std::complex<float>, cublasCdotu)
+DOT_LAUNCHER_USM(u, std::complex<double>, cublasZdotu)
+#undef DOT_LAUNCHER_USM
+
+template <typename Func, typename T1, typename T2, typename T3> // helper for USM rot overloads
+inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, const int64_t incx,
+                           T1 *y, int64_t incy, T2 c, T3 s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE)                                      \
+    cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \
+                        int64_t incy, TYPE2 c, TYPE3 s,                                            \
+                        const cl::sycl::vector_class<cl::sycl::event> &dependencies) {             \
+        return rot(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies);                \
+    }
+
+ROT_LAUNCHER_USM(float, float, float, cublasSrot)
+ROT_LAUNCHER_USM(double, double, double, cublasDrot)
+ROT_LAUNCHER_USM(std::complex<float>, float, float, cublasCsrot)
+ROT_LAUNCHER_USM(std::complex<double>, double, double, cublasZdrot)
+#undef ROT_LAUNCHER_USM
+
+cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
+                       const float *y, int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM sdsdot always throws
+}
+
+cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
+                    int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: USM dot (float in, double out) always throws
+}
+
+template <typename Func, typename T> // helper for the USM rotmg overloads
+inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                         \
+    cl::sycl::event rotmg(cl::sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, \
+                          TYPE *param,                                                   \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return rotmg(CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies);        \
+    }
+
+ROTMG_LAUNCHER_USM(float, cublasSrotmg)
+ROTMG_LAUNCHER_USM(double, cublasDrotmg)
+#undef ROTMG_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM iamax overloads
+inline cl::sycl::event iamax(Func func, cl::sycl::queue &queue, int64_t n, const T *x,
+                             const int64_t incx, int64_t *result,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
+                          int64_t *result,                                                      \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return iamax(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies);                  \
+    }
+IAMAX_LAUNCHER_USM(float, cublasIsamax)
+IAMAX_LAUNCHER_USM(double, cublasIdamax)
+IAMAX_LAUNCHER_USM(std::complex<float>, cublasIcamax)
+IAMAX_LAUNCHER_USM(std::complex<double>, cublasIzamax)
+#undef IAMAX_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM swap overloads
+inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
+                            int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, \
+                         int64_t incy,                                                      \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return swap(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);              \
+    }
+
+SWAP_LAUNCHER_USM(float, cublasSswap)
+SWAP_LAUNCHER_USM(double, cublasDswap)
+SWAP_LAUNCHER_USM(std::complex<float>, cublasCswap)
+SWAP_LAUNCHER_USM(std::complex<double>, cublasZswap)
+#undef SWAP_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM iamin overloads
+inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const T *x,
+                             const int64_t incx, int64_t *result,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
+                          int64_t *result,                                                      \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return iamin(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies);                  \
+    }
+IAMIN_LAUNCHER_USM(float, cublasIsamin)
+IAMIN_LAUNCHER_USM(double, cublasIdamin)
+IAMIN_LAUNCHER_USM(std::complex<float>, cublasIcamin)
+IAMIN_LAUNCHER_USM(std::complex<double>, cublasIzamin)
+#undef IAMIN_LAUNCHER_USM
+
+template <typename Func, typename T1, typename T2> // helper for the USM nrm2 overloads
+inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const T1 *x,
+                            const int64_t incx, T2 *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
+    cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
+                         TYPE2 *result,                                                         \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {         \
+        return nrm2(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies);                   \
+    }
+NRM2_LAUNCHER_USM(float, float, cublasSnrm2)
+NRM2_LAUNCHER_USM(double, double, cublasDnrm2)
+NRM2_LAUNCHER_USM(std::complex<float>, float, cublasScnrm2)
+NRM2_LAUNCHER_USM(std::complex<double>, double, cublasDznrm2)
+#undef NRM2_LAUNCHER_USM
+
 } // namespace cublas
 } // namespace onemkl
diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp
index 25fda3ba5..69989c4c5 100644
--- a/src/blas/backends/cublas/cublas_level2.cpp
+++ b/src/blas/backends/cublas/cublas_level2.cpp
@@ -19,10 +19,14 @@
 #include <CL/sycl/detail/pi.hpp>
 #include "cublas_helper.hpp"
 #include "cublas_scope_handle.hpp"
+#include "include/exceptions_helper.hpp"
 #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
 
 namespace onemkl {
 namespace cublas {
+
+// Buffer APIs (operands passed as cl::sycl::buffer)
+
 template <typename Func, typename T>
 inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha,
                  cl::sycl::buffer<T, 1> &a, int64_t lda, cl::sycl::buffer<T, 1> &x, int64_t incx,
@@ -840,5 +844,506 @@ TRSV_LAUNCHER(std::complex<double>, cublasZtrsv)
 
 #undef TRSV_LAUNCHER
 
+// USM (Unified Shared Memory) APIs: pointer-based, each returns a cl::sycl::event
+
+template <typename Func, typename T> // helper for the USM gemv overloads
+inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m,
+                            int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
+                            T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n,        \
+                         TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx,  \
+                         TYPE beta, TYPE *y, int64_t incy,                                     \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return gemv(CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, \
+                    dependencies);                                                             \
+    }
+
+GEMV_LAUNCHER_USM(float, cublasSgemv)
+GEMV_LAUNCHER_USM(double, cublasDgemv)
+GEMV_LAUNCHER_USM(std::complex<float>, cublasCgemv)
+GEMV_LAUNCHER_USM(std::complex<double>, cublasZgemv)
+#undef GEMV_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM gbmv overloads
+inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m,
+                            int64_t n, int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda,
+                            const T *x, int64_t incx, T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n,          \
+                         int64_t kl, int64_t ku, TYPE alpha, const TYPE *a, int64_t lda,         \
+                         const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,          \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return gbmv(CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \
+                    incy, dependencies);                                                         \
+    }
+
+GBMV_LAUNCHER_USM(float, cublasSgbmv)
+GBMV_LAUNCHER_USM(double, cublasDgbmv)
+GBMV_LAUNCHER_USM(std::complex<float>, cublasCgbmv)
+GBMV_LAUNCHER_USM(std::complex<double>, cublasZgbmv)
+#undef GBMV_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM ger/geru/gerc launchers
+inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha,
+                           const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event ger##EXT(cl::sycl::queue &queue, int64_t m, int64_t n, TYPE alpha,          \
+                             const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \
+                             int64_t lda,                                                       \
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return ger(CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); \
+    }
+
+GER_LAUNCHER_USM(, float, cublasSger)
+GER_LAUNCHER_USM(, double, cublasDger)
+GER_LAUNCHER_USM(u, std::complex<float>, cublasCgeru)
+GER_LAUNCHER_USM(u, std::complex<double>, cublasZgeru)
+GER_LAUNCHER_USM(c, std::complex<float>, cublasCgerc)
+GER_LAUNCHER_USM(c, std::complex<double>, cublasZgerc)
+#undef GER_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM hbmv overloads
+inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                            int64_t k, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
+                            T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,       \
+                         TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx,  \
+                         TYPE beta, TYPE *y, int64_t incy,                                     \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return hbmv(CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \
+                    incy, dependencies);                                                       \
+    }
+
+HBMV_LAUNCHER_USM(std::complex<float>, cublasChbmv)
+HBMV_LAUNCHER_USM(std::complex<double>, cublasZhbmv)
+#undef HBMV_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM hemv overloads
+inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
+                            int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,         \
+                         const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta,      \
+                         TYPE *y, int64_t incy,                                                   \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return hemv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
+                    dependencies);                                                                \
+    }
+
+HEMV_LAUNCHER_USM(std::complex<float>, cublasChemv)
+HEMV_LAUNCHER_USM(std::complex<double>, cublasZhemv)
+#undef HEMV_LAUNCHER_USM
+
+template <typename Func, typename ScalarType, typename DataType> // helper for USM her overloads
+inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                           ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                 \
+    cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,  \
+                        const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda,             \
+                        const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return her(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
+    }
+
+HER_LAUNCHER_USM(float, std::complex<float>, cublasCher)
+HER_LAUNCHER_USM(double, std::complex<double>, cublasZher)
+
+#undef HER_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM her2 overloads
+inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,   \
+                         const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \
+                         int64_t lda,                                                       \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return her2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \
+                    dependencies);                                                          \
+    }
+
+HER2_LAUNCHER_USM(std::complex<float>, cublasCher2)
+HER2_LAUNCHER_USM(std::complex<double>, cublasZher2)
+
+#undef HER2_LAUNCHER_USM
+
+template <typename Func, typename T> // helper for the USM hpmv overloads
+inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
+    cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,    \
+                         const TYPE *a, const TYPE *x, int64_t incx, TYPE beta, TYPE *y,     \
+                         int64_t incy,                                                       \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {      \
+        return hpmv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \
+                    dependencies);                                                           \
+    }
+
+HPMV_LAUNCHER_USM(std::complex<float>, cublasChpmv)
+HPMV_LAUNCHER_USM(std::complex<double>, cublasZhpmv)
+
+#undef HPMV_LAUNCHER_USM
+
+template <typename Func, typename ScalarType, typename DataType> // helper for USM hpr overloads
+inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                           ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // stub: always throws, func is never invoked
+}
+
+#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                \
+    cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \
+                        const DATA_TYPE *x, int64_t incx, DATA_TYPE *a,                         \
+                        const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return hpr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies);     \
+    }
+
+HPR_LAUNCHER_USM(float, std::complex<float>, cublasChpr)
+HPR_LAUNCHER_USM(double, std::complex<double>, cublasZhpr)
+
+#undef HPR_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *x, int64_t incx, const T *y, int64_t incy, T *a,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM hpr2 stub: always throws, arguments unused
+}
+
+#define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,   \
+                         const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return hpr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,      \
+                    dependencies);                                                          \
+    }
+
+HPR2_LAUNCHER_USM(std::complex<float>, cublasChpr2)
+HPR2_LAUNCHER_USM(std::complex<double>, cublasZhpr2)
+
+#undef HPR2_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                            int64_t k, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
+                            T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM sbmv stub: always throws, arguments unused
+}
+
+#define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
+    cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,       \
+                         TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx,  \
+                         TYPE beta, TYPE *y, int64_t incy,                                     \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {        \
+        return sbmv(CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \
+                    incy, dependencies);                                                       \
+    }
+
+SBMV_LAUNCHER_USM(float, cublasSsbmv)
+SBMV_LAUNCHER_USM(double, cublasDsbmv)
+
+#undef SBMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
+                            int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM symv stub: always throws, arguments unused
+}
+
+#define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,         \
+                         const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta,      \
+                         TYPE *y, int64_t incy,                                                   \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return symv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
+                    dependencies);                                                                \
+    }
+
+SYMV_LAUNCHER_USM(float, cublasSsymv)
+SYMV_LAUNCHER_USM(double, cublasDsymv)
+
+#undef SYMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                           const T *x, int64_t incx, T *a, int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM syr stub: always throws, arguments unused
+}
+
+#define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,         \
+                        const TYPE *x, int64_t incx, TYPE *a, int64_t lda,                       \
+                        const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return syr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
+    }
+
+SYR_LAUNCHER_USM(float, cublasSsyr)
+SYR_LAUNCHER_USM(double, cublasDsyr)
+// Intel does not support the following two complex-typed instantiations
+SYR_LAUNCHER_USM(std::complex<float>, cublasCsyr)
+SYR_LAUNCHER_USM(std::complex<double>, cublasZsyr)
+#undef SYR_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM syr2 stub: always throws, arguments unused
+}
+
+#define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,   \
+                         const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \
+                         int64_t lda,                                                       \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return syr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \
+                    dependencies);                                                          \
+    }
+
+SYR2_LAUNCHER_USM(float, cublasSsyr2)
+SYR2_LAUNCHER_USM(double, cublasDsyr2)
+// Intel does not support the following two complex-typed instantiations
+SYR2_LAUNCHER_USM(std::complex<float>, cublasCsyr2)
+SYR2_LAUNCHER_USM(std::complex<double>, cublasZsyr2)
+
+#undef SYR2_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM spmv stub: always throws, arguments unused
+}
+
+#define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
+    cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,    \
+                         const TYPE *a, const TYPE *x, int64_t incx, TYPE beta, TYPE *y,     \
+                         int64_t incy,                                                       \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {      \
+        return spmv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \
+                    dependencies);                                                           \
+    }
+
+SPMV_LAUNCHER_USM(float, cublasSspmv)
+SPMV_LAUNCHER_USM(double, cublasDspmv)
+
+#undef SPMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                           const T *x, int64_t incx, T *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM spr stub: always throws, arguments unused
+}
+
+#define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
+    cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,    \
+                        const TYPE *x, int64_t incx, TYPE *a,                               \
+                        const cl::sycl::vector_class<cl::sycl::event> &dependencies) {      \
+        return spr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \
+    }
+
+SPR_LAUNCHER_USM(float, cublasSspr)
+SPR_LAUNCHER_USM(double, cublasDspr)
+
+#undef SPR_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
+                            const T *x, int64_t incx, const T *y, int64_t incy, T *a,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM spr2 stub: always throws, arguments unused
+}
+
+#define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,   \
+                         const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {     \
+        return spr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,      \
+                    dependencies);                                                          \
+    }
+
+SPR2_LAUNCHER_USM(float, cublasSspr2)
+SPR2_LAUNCHER_USM(double, cublasDspr2)
+
+#undef SPR2_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
+                            int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM tbmv stub: always throws, arguments unused
+}
+
+#define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,              \
+                         diag unit_diag, int64_t n, int64_t k, const TYPE *a, int64_t lda,       \
+                         TYPE *x, int64_t incx,                                                  \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return tbmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
+                    dependencies);                                                               \
+    }
+
+TBMV_LAUNCHER_USM(float, cublasStbmv)
+TBMV_LAUNCHER_USM(double, cublasDtbmv)
+TBMV_LAUNCHER_USM(std::complex<float>, cublasCtbmv)
+TBMV_LAUNCHER_USM(std::complex<double>, cublasZtbmv)
+
+#undef TBMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
+                            int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM tbsv stub: always throws, arguments unused
+}
+
+#define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,              \
+                         diag unit_diag, int64_t n, int64_t k, const TYPE *a, int64_t lda,       \
+                         TYPE *x, int64_t incx,                                                  \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return tbsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
+                    dependencies);                                                               \
+    }
+
+TBSV_LAUNCHER_USM(float, cublasStbsv)
+TBSV_LAUNCHER_USM(double, cublasDtbsv)
+TBSV_LAUNCHER_USM(std::complex<float>, cublasCtbsv)
+TBSV_LAUNCHER_USM(std::complex<double>, cublasZtbsv)
+
+#undef TBSV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM tpmv stub: always throws, arguments unused
+}
+
+#define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                           \
+    cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,       \
+                         diag unit_diag, int64_t n, const TYPE *a, TYPE *x, int64_t incx, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {   \
+        return tpmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,  \
+                    dependencies);                                                        \
+    }
+
+TPMV_LAUNCHER_USM(float, cublasStpmv)
+TPMV_LAUNCHER_USM(double, cublasDtpmv)
+TPMV_LAUNCHER_USM(std::complex<float>, cublasCtpmv)
+TPMV_LAUNCHER_USM(std::complex<double>, cublasZtpmv)
+
+#undef TPMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM tpsv stub: always throws, arguments unused
+}
+
+#define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                           \
+    cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,       \
+                         diag unit_diag, int64_t n, const TYPE *a, TYPE *x, int64_t incx, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {   \
+        return tpsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,  \
+                    dependencies);                                                        \
+    }
+
+TPSV_LAUNCHER_USM(float, cublasStpsv)
+TPSV_LAUNCHER_USM(double, cublasDtpsv)
+TPSV_LAUNCHER_USM(std::complex<float>, cublasCtpsv)
+TPSV_LAUNCHER_USM(std::complex<double>, cublasZtpsv)
+
+#undef TPSV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM trmv stub: always throws, arguments unused
+}
+
+#define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                               \
+    cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,           \
+                         diag unit_diag, int64_t n, const TYPE *a, int64_t lda, TYPE *x,      \
+                         int64_t incx,                                                        \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {       \
+        return trmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
+                    dependencies);                                                            \
+    }
+
+TRMV_LAUNCHER_USM(float, cublasStrmv)
+TRMV_LAUNCHER_USM(double, cublasDtrmv)
+TRMV_LAUNCHER_USM(std::complex<float>, cublasCtrmv)
+TRMV_LAUNCHER_USM(std::complex<double>, cublasZtrmv)
+
+#undef TRMV_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM trsv stub: always throws, arguments unused
+}
+
+#define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                               \
+    cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans,           \
+                         diag unit_diag, int64_t n, const TYPE *a, int64_t lda, TYPE *x,      \
+                         int64_t incx,                                                        \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {       \
+        return trsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
+                    dependencies);                                                            \
+    }
+
+TRSV_LAUNCHER_USM(float, cublasStrsv)
+TRSV_LAUNCHER_USM(double, cublasDtrsv)
+TRSV_LAUNCHER_USM(std::complex<float>, cublasCtrsv)
+TRSV_LAUNCHER_USM(std::complex<double>, cublasZtrsv)
+
+#undef TRSV_LAUNCHER_USM
+
 } // namespace cublas
 } // namespace onemkl
diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp
index 81f7b7a91..0bb0c911b 100644
--- a/src/blas/backends/cublas/cublas_level3.cpp
+++ b/src/blas/backends/cublas/cublas_level3.cpp
@@ -19,10 +19,14 @@
 #include <CL/sycl/detail/pi.hpp>
 #include "cublas_helper.hpp"
 #include "cublas_scope_handle.hpp"
+#include "include/exceptions_helper.hpp"
 #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
 
 namespace onemkl {
 namespace cublas {
+
+// Buffer APIs
+
 template <typename Func, typename T>
 inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
                  int64_t n, int64_t k, T alpha, cl::sycl::buffer<T, 1> &a, int64_t lda,
@@ -375,5 +379,220 @@ TRSM_LAUNCHER(std::complex<float>, cublasCtrsm)
 TRSM_LAUNCHER(std::complex<double>, cublasZtrsm)
 
 #undef TRSM_LAUNCHER
+
+// USM APIs
+
+template <typename Func, typename T>
+inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose transb,
+                            int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
+                            const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM gemm stub: always throws, arguments unused
+}
+
+#define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
+                         int64_t n, int64_t k, TYPE alpha, const TYPE *a, int64_t lda,           \
+                         const TYPE *b, int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,            \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return gemm(CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, \
+                    c, ldc, dependencies);                                                       \
+    }
+
+GEMM_LAUNCHER_USM(float, cublasSgemm)
+GEMM_LAUNCHER_USM(double, cublasDgemm)
+GEMM_LAUNCHER_USM(std::complex<float>, cublasCgemm)
+GEMM_LAUNCHER_USM(std::complex<double>, cublasZgemm)
+
+#undef GEMM_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *b,
+                            int64_t ldb, T beta, T *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM symm stub: always throws, arguments unused
+}
+
+#define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,   \
+                         int64_t n, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,       \
+                         int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                           \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return symm(CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
+                    beta, c, ldc, dependencies);                                                 \
+    }
+
+SYMM_LAUNCHER_USM(float, cublasSsymm)
+SYMM_LAUNCHER_USM(double, cublasDsymm)
+SYMM_LAUNCHER_USM(std::complex<float>, cublasCsymm)
+SYMM_LAUNCHER_USM(std::complex<double>, cublasZsymm)
+
+#undef SYMM_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *b,
+                            int64_t ldb, T beta, T *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM hemm stub: always throws, arguments unused
+}
+
+#define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
+    cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,   \
+                         int64_t n, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,       \
+                         int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                           \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {          \
+        return hemm(CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
+                    beta, c, ldc, dependencies);                                                 \
+    }
+HEMM_LAUNCHER_USM(std::complex<float>, cublasChemm)
+HEMM_LAUNCHER_USM(std::complex<double>, cublasZhemm)
+
+#undef HEMM_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            int64_t n, int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c,
+                            int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM syrk stub: always throws, arguments unused
+}
+
+#define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,    \
+                         int64_t k, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c,   \
+                         int64_t ldc,                                                             \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return syrk(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
+                    dependencies);                                                                \
+    }
+
+SYRK_LAUNCHER_USM(float, cublasSsyrk)
+SYRK_LAUNCHER_USM(double, cublasDsyrk)
+SYRK_LAUNCHER_USM(std::complex<float>, cublasCsyrk)
+SYRK_LAUNCHER_USM(std::complex<double>, cublasZsyrk)
+
+#undef SYRK_LAUNCHER_USM
+
+template <typename Func, typename DataType, typename ScalarType>
+inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            int64_t n, int64_t k, ScalarType alpha, const DataType *a, int64_t lda,
+                            ScalarType beta, DataType *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM herk stub: always throws, arguments unused
+}
+
+#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
+    cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,    \
+                         int64_t k, SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda,           \
+                         SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,                             \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return herk(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
+                    dependencies);                                                                \
+    }
+
+HERK_LAUNCHER_USM(std::complex<float>, float, cublasCherk)
+HERK_LAUNCHER_USM(std::complex<double>, double, cublasZherk)
+
+#undef HERK_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                             int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b,
+                             int64_t ldb, T beta, T *c, int64_t ldc,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM syr2k stub: always throws, arguments unused
+}
+
+#define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
+    cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,    \
+                          int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,        \
+                          int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                            \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return syr2k(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, \
+                     c, ldc, dependencies);                                                        \
+    }
+SYR2K_LAUNCHER_USM(float, cublasSsyr2k)
+SYR2K_LAUNCHER_USM(double, cublasDsyr2k)
+SYR2K_LAUNCHER_USM(std::complex<float>, cublasCsyr2k)
+SYR2K_LAUNCHER_USM(std::complex<double>, cublasZsyr2k)
+
+#undef SYR2K_LAUNCHER_USM
+
+template <typename Func, typename DataType, typename ScalarType>
+inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                             int64_t n, int64_t k, DataType alpha, const DataType *a, int64_t lda,
+                             const DataType *b, int64_t ldb, ScalarType beta, DataType *c,
+                             int64_t ldc,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM her2k stub: always throws, arguments unused
+}
+
+#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
+    cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,    \
+                          int64_t k, DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda,             \
+                          const DATA_TYPE *b, int64_t ldb, SCALAR_TYPE beta, DATA_TYPE *c,         \
+                          int64_t ldc,                                                             \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) {           \
+        return her2k(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, \
+                     c, ldc, dependencies);                                                        \
+    }
+
+HER2K_LAUNCHER_USM(std::complex<float>, float, cublasCher2k)
+HER2K_LAUNCHER_USM(std::complex<double>, double, cublasZher2k)
+
+#undef HER2K_LAUNCHER_USM
+
+// NOTE: In cuBLAS, TRMM diverges from the Netlib BLAS definition: for
+// performance reasons it requires the C matrix to be separate from the
+// B matrix. It is possible to use B instead of C, but this will slow down
+// the code.
+template <typename Func, typename T>
+inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
+                            const T *a, int64_t lda, T *b, int64_t ldb,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM trmm stub: always throws, arguments unused
+}
+
+#define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
+    cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower,                \
+                         transpose trans, diag unit_diag, int64_t m, int64_t n, TYPE alpha,        \
+                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,                         \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {            \
+        return trmm(CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
+                    a, lda, b, ldb, dependencies);                                                 \
+    }
+TRMM_LAUNCHER_USM(float, cublasStrmm)
+TRMM_LAUNCHER_USM(double, cublasDtrmm)
+TRMM_LAUNCHER_USM(std::complex<float>, cublasCtrmm)
+TRMM_LAUNCHER_USM(std::complex<double>, cublasZtrmm)
+
+#undef TRMM_LAUNCHER_USM
+
+template <typename Func, typename T>
+inline cl::sycl::event trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
+                            const T *a, int64_t lda, T *b, int64_t ldb,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM trsm stub: always throws, arguments unused
+}
+
+#define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
+    cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower,                \
+                         transpose trans, diag unit_diag, int64_t m, int64_t n, TYPE alpha,        \
+                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,                         \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) {            \
+        return trsm(CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
+                    a, lda, b, ldb, dependencies);                                                 \
+    }
+TRSM_LAUNCHER_USM(float, cublasStrsm)
+TRSM_LAUNCHER_USM(double, cublasDtrsm)
+TRSM_LAUNCHER_USM(std::complex<float>, cublasCtrsm)
+TRSM_LAUNCHER_USM(std::complex<double>, cublasZtrsm)
+
+#undef TRSM_LAUNCHER_USM
+
 } // namespace cublas
 } // namespace onemkl
diff --git a/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp b/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp
index 2a6ee6dab..df55993b7 100644
--- a/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp
+++ b/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp
@@ -178,14 +178,6 @@ extern "C" function_table_t mkl_blas_table = {
     onemkl::cublas::gemm_batch,
     onemkl::cublas::gemm_batch,
     onemkl::cublas::gemm_batch,
-    onemkl::cublas::gemm_batch,
-    onemkl::cublas::gemm_batch,
-    onemkl::cublas::gemm_batch,
-    onemkl::cublas::gemm_batch,
-    onemkl::cublas::trsm_batch,
-    onemkl::cublas::trsm_batch,
-    onemkl::cublas::trsm_batch,
-    onemkl::cublas::trsm_batch,
     onemkl::cublas::trsm_batch,
     onemkl::cublas::trsm_batch,
     onemkl::cublas::trsm_batch,
@@ -201,4 +193,170 @@ extern "C" function_table_t mkl_blas_table = {
     onemkl::cublas::gemm_ext,
     onemkl::cublas::gemm_ext,
     onemkl::cublas::gemm_ext,
+    onemkl::cublas::asum,
+    onemkl::cublas::asum,
+    onemkl::cublas::asum,
+    onemkl::cublas::asum,
+    onemkl::cublas::axpy,
+    onemkl::cublas::axpy,
+    onemkl::cublas::axpy,
+    onemkl::cublas::axpy,
+    onemkl::cublas::axpy_batch,
+    onemkl::cublas::axpy_batch,
+    onemkl::cublas::axpy_batch,
+    onemkl::cublas::axpy_batch,
+    onemkl::cublas::copy,
+    onemkl::cublas::copy,
+    onemkl::cublas::copy,
+    onemkl::cublas::copy,
+    onemkl::cublas::dot,
+    onemkl::cublas::dot,
+    onemkl::cublas::dot,
+    onemkl::cublas::dotc,
+    onemkl::cublas::dotc,
+    onemkl::cublas::dotu,
+    onemkl::cublas::dotu,
+    onemkl::cublas::iamin,
+    onemkl::cublas::iamin,
+    onemkl::cublas::iamin,
+    onemkl::cublas::iamin,
+    onemkl::cublas::iamax,
+    onemkl::cublas::iamax,
+    onemkl::cublas::iamax,
+    onemkl::cublas::iamax,
+    onemkl::cublas::nrm2,
+    onemkl::cublas::nrm2,
+    onemkl::cublas::nrm2,
+    onemkl::cublas::nrm2,
+    onemkl::cublas::rot,
+    onemkl::cublas::rot,
+    onemkl::cublas::rot,
+    onemkl::cublas::rot,
+    onemkl::cublas::rotg,
+    onemkl::cublas::rotg,
+    onemkl::cublas::rotg,
+    onemkl::cublas::rotg,
+    onemkl::cublas::rotm,
+    onemkl::cublas::rotm,
+    onemkl::cublas::rotmg,
+    onemkl::cublas::rotmg,
+    onemkl::cublas::scal,
+    onemkl::cublas::scal,
+    onemkl::cublas::scal,
+    onemkl::cublas::scal,
+    onemkl::cublas::scal,
+    onemkl::cublas::scal,
+    onemkl::cublas::sdsdot,
+    onemkl::cublas::swap,
+    onemkl::cublas::swap,
+    onemkl::cublas::swap,
+    onemkl::cublas::swap,
+    onemkl::cublas::gbmv,
+    onemkl::cublas::gbmv,
+    onemkl::cublas::gbmv,
+    onemkl::cublas::gbmv,
+    onemkl::cublas::gemv,
+    onemkl::cublas::gemv,
+    onemkl::cublas::gemv,
+    onemkl::cublas::gemv,
+    onemkl::cublas::ger,
+    onemkl::cublas::ger,
+    onemkl::cublas::gerc,
+    onemkl::cublas::gerc,
+    onemkl::cublas::geru,
+    onemkl::cublas::geru,
+    onemkl::cublas::hbmv,
+    onemkl::cublas::hbmv,
+    onemkl::cublas::hemv,
+    onemkl::cublas::hemv,
+    onemkl::cublas::her,
+    onemkl::cublas::her,
+    onemkl::cublas::her2,
+    onemkl::cublas::her2,
+    onemkl::cublas::hpmv,
+    onemkl::cublas::hpmv,
+    onemkl::cublas::hpr,
+    onemkl::cublas::hpr,
+    onemkl::cublas::hpr2,
+    onemkl::cublas::hpr2,
+    onemkl::cublas::sbmv,
+    onemkl::cublas::sbmv,
+    onemkl::cublas::spmv,
+    onemkl::cublas::spmv,
+    onemkl::cublas::spr,
+    onemkl::cublas::spr,
+    onemkl::cublas::spr2,
+    onemkl::cublas::spr2,
+    onemkl::cublas::symv,
+    onemkl::cublas::symv,
+    onemkl::cublas::syr,
+    onemkl::cublas::syr,
+    onemkl::cublas::syr2,
+    onemkl::cublas::syr2,
+    onemkl::cublas::tbmv,
+    onemkl::cublas::tbmv,
+    onemkl::cublas::tbmv,
+    onemkl::cublas::tbmv,
+    onemkl::cublas::tbsv,
+    onemkl::cublas::tbsv,
+    onemkl::cublas::tbsv,
+    onemkl::cublas::tbsv,
+    onemkl::cublas::tpmv,
+    onemkl::cublas::tpmv,
+    onemkl::cublas::tpmv,
+    onemkl::cublas::tpmv,
+    onemkl::cublas::tpsv,
+    onemkl::cublas::tpsv,
+    onemkl::cublas::tpsv,
+    onemkl::cublas::tpsv,
+    onemkl::cublas::trmv,
+    onemkl::cublas::trmv,
+    onemkl::cublas::trmv,
+    onemkl::cublas::trmv,
+    onemkl::cublas::trsv,
+    onemkl::cublas::trsv,
+    onemkl::cublas::trsv,
+    onemkl::cublas::trsv,
+    onemkl::cublas::gemm,
+    onemkl::cublas::gemm,
+    onemkl::cublas::gemm,
+    onemkl::cublas::gemm,
+    onemkl::cublas::hemm,
+    onemkl::cublas::hemm,
+    onemkl::cublas::herk,
+    onemkl::cublas::herk,
+    onemkl::cublas::her2k,
+    onemkl::cublas::her2k,
+    onemkl::cublas::symm,
+    onemkl::cublas::symm,
+    onemkl::cublas::symm,
+    onemkl::cublas::symm,
+    onemkl::cublas::syrk,
+    onemkl::cublas::syrk,
+    onemkl::cublas::syrk,
+    onemkl::cublas::syrk,
+    onemkl::cublas::syr2k,
+    onemkl::cublas::syr2k,
+    onemkl::cublas::syr2k,
+    onemkl::cublas::syr2k,
+    onemkl::cublas::trmm,
+    onemkl::cublas::trmm,
+    onemkl::cublas::trmm,
+    onemkl::cublas::trmm,
+    onemkl::cublas::trsm,
+    onemkl::cublas::trsm,
+    onemkl::cublas::trsm,
+    onemkl::cublas::trsm,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemm_batch,
+    onemkl::cublas::gemmt,
+    onemkl::cublas::gemmt,
+    onemkl::cublas::gemmt,
+    onemkl::cublas::gemmt,
 };
diff --git a/src/blas/backends/mklcpu/cpu_batch.cpp b/src/blas/backends/mklcpu/cpu_batch.cpp
index 43b6670e4..78fc55cb5 100644
--- a/src/blas/backends/mklcpu/cpu_batch.cpp
+++ b/src/blas/backends/mklcpu/cpu_batch.cpp
@@ -25,398 +25,7 @@
 namespace onemkl {
 namespace mklcpu {
 
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<int64_t, 1> &k,
-                cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                cl::sycl::buffer<int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                cl::sycl::buffer<int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-                cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<int64_t, 1> &ldc,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>(cgh);
-        auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto k_acc          = k.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>(cgh);
-        auto c_acc          = c.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-
-        host_task<class mkl_kernel_init_sgemm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            float **a_array      = (float **)::malloc(sizeof(float *) * total_size);
-            float **b_array      = (float **)::malloc(sizeof(float *) * total_size);
-            float **c_array      = (float **)::malloc(sizeof(float *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *k_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldc_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0;
-            char *transa_ = (char *)::malloc(sizeof(char) * group_count);
-            char *transb_ = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                k_[i]          = k_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                ldc_[i]        = ldc_acc[i];
-                group_size_[i] = group_size_acc[i];
-                transa_[i]     = *fortran_char(transa_acc[i]);
-                transb_[i]     = *fortran_char(transb_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                        c_array[0] = c_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                        c_array[idx] = c_array[idx - 1] + offset_c;
-                    }
-                    idx++;
-                    offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i]
-                                                                      : lda_acc[i] * m_acc[i];
-                    offset_b = (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i]
-                                                                      : ldb_acc[i] * k_acc[i];
-                    offset_c = ldc_acc[i] * n_acc[i];
-                }
-            }
-
-            ::sgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(),
-                          (const float **)a_array, lda_, (const float **)b_array, ldb_,
-                          beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count,
-                          group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(c_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(k_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(ldc_);
-            ::free(group_size_);
-            ::free(transa_);
-            ::free(transb_);
-        });
-    });
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<int64_t, 1> &k,
-                cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-                cl::sycl::buffer<int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-                cl::sycl::buffer<int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-                cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<int64_t, 1> &ldc,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>(cgh);
-        auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto k_acc          = k.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>(cgh);
-        auto c_acc          = c.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-
-        host_task<class mkl_kernel_dgemm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            double **a_array     = (double **)::malloc(sizeof(double *) * total_size);
-            double **b_array     = (double **)::malloc(sizeof(double *) * total_size);
-            double **c_array     = (double **)::malloc(sizeof(double *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *k_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldc_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0;
-            char *transa_ = (char *)::malloc(sizeof(char) * group_count);
-            char *transb_ = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                k_[i]          = k_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                ldc_[i]        = ldc_acc[i];
-                group_size_[i] = group_size_acc[i];
-                transa_[i]     = *fortran_char(transa_acc[i]);
-                transb_[i]     = *fortran_char(transb_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                        c_array[0] = c_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                        c_array[idx] = c_array[idx - 1] + offset_c;
-                    }
-                    idx++;
-                    offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i]
-                                                                      : lda_acc[i] * m_acc[i];
-                    offset_b = (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i]
-                                                                      : ldb_acc[i] * k_acc[i];
-                    offset_c = ldc_acc[i] * n_acc[i];
-                }
-            }
-
-            ::dgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(),
-                          (const double **)a_array, lda_, (const double **)b_array, ldb_,
-                          beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count,
-                          group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(c_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(k_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(ldc_);
-            ::free(group_size_);
-            ::free(transa_);
-            ::free(transb_);
-        });
-    });
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<int64_t, 1> &ldc,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>(cgh);
-        auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto k_acc          = k.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>(cgh);
-        auto c_acc          = c.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-
-        host_task<class mkl_kernel_cgemm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size);
-            MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size);
-            MKL_Complex8 **c_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size);
-            MKL_INT *m_            = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_            = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *k_            = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldc_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_   = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0;
-            char *transa_ = (char *)::malloc(sizeof(char) * group_count);
-            char *transb_ = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                k_[i]          = k_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                ldc_[i]        = ldc_acc[i];
-                group_size_[i] = group_size_acc[i];
-                transa_[i]     = *fortran_char(transa_acc[i]);
-                transb_[i]     = *fortran_char(transb_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                        c_array[0] = c_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                        c_array[idx] = c_array[idx - 1] + offset_c;
-                    }
-                    idx++;
-                    offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i]
-                                                                      : lda_acc[i] * m_acc[i];
-                    offset_b = (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i]
-                                                                      : ldb_acc[i] * k_acc[i];
-                    offset_c = ldc_acc[i] * n_acc[i];
-                }
-            }
-
-            ::cgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(),
-                          (const MKL_Complex8 **)a_array, lda_, (const MKL_Complex8 **)b_array,
-                          ldb_, beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count,
-                          group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(c_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(k_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(ldc_);
-            ::free(group_size_);
-            ::free(transa_);
-            ::free(transb_);
-        });
-    });
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<double>, 1> &alpha,
-                cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<double>, 1> &beta,
-                cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<int64_t, 1> &ldc,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>(cgh);
-        auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto k_acc          = k.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>(cgh);
-        auto c_acc          = c.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-
-        host_task<class mkl_kernel_zgemm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            MKL_Complex16 **a_array =
-                (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size);
-            MKL_Complex16 **b_array =
-                (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size);
-            MKL_Complex16 **c_array =
-                (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *k_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldc_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0;
-            char *transa_ = (char *)::malloc(sizeof(char) * group_count);
-            char *transb_ = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                k_[i]          = k_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                ldc_[i]        = ldc_acc[i];
-                group_size_[i] = group_size_acc[i];
-                transa_[i]     = *fortran_char(transa_acc[i]);
-                transb_[i]     = *fortran_char(transb_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                        c_array[0] = c_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                        c_array[idx] = c_array[idx - 1] + offset_c;
-                    }
-                    idx++;
-                    offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i]
-                                                                      : lda_acc[i] * m_acc[i];
-                    offset_b = (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i]
-                                                                      : ldb_acc[i] * k_acc[i];
-                    offset_c = ldc_acc[i] * n_acc[i];
-                }
-            }
-
-            ::zgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(),
-                          (const MKL_Complex16 **)a_array, lda_, (const MKL_Complex16 **)b_array,
-                          ldb_, beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count,
-                          group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(c_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(k_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(ldc_);
-            ::free(group_size_);
-            ::free(transa_);
-            ::free(transb_);
-        });
-    });
-}
+// Buffer APIs
 
 void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
                 int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
@@ -435,6 +44,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6
             float **a_array = (float **)::malloc(sizeof(float *) * batch_size);
             float **b_array = (float **)::malloc(sizeof(float *) * batch_size);
             float **c_array = (float **)::malloc(sizeof(float *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                ::free(c_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -479,6 +95,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6
             double **a_array = (double **)::malloc(sizeof(double *) * batch_size);
             double **b_array = (double **)::malloc(sizeof(double *) * batch_size);
             double **c_array = (double **)::malloc(sizeof(double *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                ::free(c_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -524,6 +147,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6
             MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size);
             MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size);
             MKL_Complex8 **c_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                ::free(c_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -572,6 +202,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6
                 (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size);
             MKL_Complex16 **c_array =
                 (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                ::free(c_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -599,92 +236,6 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6
     });
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>(cgh);
-        auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>(cgh);
-        auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>(cgh);
-        auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-        host_task<class mkl_kernel_init_strsm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            float **a_array      = (float **)::malloc(sizeof(float *) * total_size);
-            float **b_array      = (float **)::malloc(sizeof(float *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, idx = 0;
-            char *side_  = (char *)::malloc(sizeof(char) * group_count);
-            char *uplo_  = (char *)::malloc(sizeof(char) * group_count);
-            char *trans_ = (char *)::malloc(sizeof(char) * group_count);
-            char *diag_  = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                group_size_[i] = group_size_acc[i];
-                trans_[i]      = *fortran_char(trans_acc[i]);
-                side_[i]       = *fortran_char(side_acc[i]);
-                uplo_[i]       = *fortran_char(uplo_acc[i]);
-                diag_[i]       = *fortran_char(diag_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                    }
-                    idx++;
-                    offset_a =
-                        (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-                    offset_b = ldb_acc[i] * n_acc[i];
-                }
-            }
-
-            ::strsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(),
-                          (const float **)a_array, lda_, (float **)b_array, ldb_,
-                          (MKL_INT *)&group_count, group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(group_size_);
-            ::free(side_);
-            ::free(uplo_);
-            ::free(trans_);
-            ::free(diag_);
-        });
-    });
-}
-
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
                 diag unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
                 int64_t lda, int64_t stride_a, cl::sycl::buffer<float, 1> &b, int64_t ldb,
@@ -701,6 +252,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
         host_task<class mkl_kernel_init_strsm_batch_stride>(cgh, [=]() {
             float **a_array = (float **)::malloc(sizeof(float *) * batch_size);
             float **b_array = (float **)::malloc(sizeof(float *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -724,92 +281,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
     });
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>(cgh);
-        auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>(cgh);
-        auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>(cgh);
-        auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-        host_task<class mkl_kernel_init_dtrsm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            double **a_array     = (double **)::malloc(sizeof(double *) * total_size);
-            double **b_array     = (double **)::malloc(sizeof(double *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, idx = 0;
-            char *side_  = (char *)::malloc(sizeof(char) * group_count);
-            char *uplo_  = (char *)::malloc(sizeof(char) * group_count);
-            char *trans_ = (char *)::malloc(sizeof(char) * group_count);
-            char *diag_  = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                group_size_[i] = group_size_acc[i];
-                trans_[i]      = *fortran_char(trans_acc[i]);
-                side_[i]       = *fortran_char(side_acc[i]);
-                uplo_[i]       = *fortran_char(uplo_acc[i]);
-                diag_[i]       = *fortran_char(diag_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                    }
-                    idx++;
-                    offset_a =
-                        (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-                    offset_b = ldb_acc[i] * n_acc[i];
-                }
-            }
-
-            ::dtrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(),
-                          (const double **)a_array, lda_, (double **)b_array, ldb_,
-                          (MKL_INT *)&group_count, group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(group_size_);
-            ::free(side_);
-            ::free(uplo_);
-            ::free(trans_);
-            ::free(diag_);
-        });
-    });
-}
-
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
                 diag unit_diag, int64_t m, int64_t n, double alpha, cl::sycl::buffer<double, 1> &a,
                 int64_t lda, int64_t stride_a, cl::sycl::buffer<double, 1> &b, int64_t ldb,
@@ -826,6 +297,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
         host_task<class mkl_kernel_init_dtrsm_batch_stride>(cgh, [=]() {
             double **a_array = (double **)::malloc(sizeof(double *) * batch_size);
             double **b_array = (double **)::malloc(sizeof(double *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -849,92 +326,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
     });
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>(cgh);
-        auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>(cgh);
-        auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>(cgh);
-        auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-        host_task<class mkl_kernel_init_ctrsm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size);
-            MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size);
-            MKL_INT *m_            = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_            = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_   = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, idx = 0;
-            char *side_  = (char *)::malloc(sizeof(char) * group_count);
-            char *uplo_  = (char *)::malloc(sizeof(char) * group_count);
-            char *trans_ = (char *)::malloc(sizeof(char) * group_count);
-            char *diag_  = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                group_size_[i] = group_size_acc[i];
-                trans_[i]      = *fortran_char(trans_acc[i]);
-                side_[i]       = *fortran_char(side_acc[i]);
-                uplo_[i]       = *fortran_char(uplo_acc[i]);
-                diag_[i]       = *fortran_char(diag_acc[i]);
-
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                    }
-                    idx++;
-                    offset_a =
-                        (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-                    offset_b = ldb_acc[i] * n_acc[i];
-                }
-            }
-
-            ::ctrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(),
-                          (const MKL_Complex8 **)a_array, lda_, (MKL_Complex8 **)b_array, ldb_,
-                          (MKL_INT *)&group_count, group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(group_size_);
-            ::free(side_);
-            ::free(uplo_);
-            ::free(trans_);
-            ::free(diag_);
-        });
-    });
-}
-
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
                 diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
                 cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
@@ -952,6 +343,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
         host_task<class mkl_kernel_init_ctrsm_batch_stride>(cgh, [=]() {
             MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size);
             MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -975,94 +372,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
     });
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<int64_t, 1> &m,
-                cl::sycl::buffer<int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-                cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<int64_t, 1> &ldb,
-                int64_t group_count, cl::sycl::buffer<int64_t, 1> &group_size) {
-    queue.submit([&](cl::sycl::handler &cgh) {
-        auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>(cgh);
-        auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>(cgh);
-        auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>(cgh);
-        auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>(cgh);
-        auto m_acc          = m.get_access<cl::sycl::access::mode::read>(cgh);
-        auto n_acc          = n.get_access<cl::sycl::access::mode::read>(cgh);
-        auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>(cgh);
-        auto a_acc          = a.get_access<cl::sycl::access::mode::read>(cgh);
-        auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>(cgh);
-        auto b_acc          = b.get_access<cl::sycl::access::mode::read_write>(cgh);
-        auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>(cgh);
-        auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>(cgh);
-
-        host_task<class mkl_kernel_init_ztrsm_batch>(cgh, [=]() {
-            int64_t total_size = 0;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                total_size += group_size_acc[i];
-            }
-
-            MKL_Complex16 **a_array =
-                (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size);
-            MKL_Complex16 **b_array =
-                (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size);
-            MKL_INT *m_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *n_          = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *lda_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *ldb_        = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count);
-            int64_t offset_a = 0, offset_b = 0, idx = 0;
-            char *side_  = (char *)::malloc(sizeof(char) * group_count);
-            char *uplo_  = (char *)::malloc(sizeof(char) * group_count);
-            char *trans_ = (char *)::malloc(sizeof(char) * group_count);
-            char *diag_  = (char *)::malloc(sizeof(char) * group_count);
-
-            for (int64_t i = 0; i < group_count; i++) {
-                m_[i]          = m_acc[i];
-                n_[i]          = n_acc[i];
-                lda_[i]        = lda_acc[i];
-                ldb_[i]        = ldb_acc[i];
-                group_size_[i] = group_size_acc[i];
-                trans_[i]      = *fortran_char(trans_acc[i]);
-                side_[i]       = *fortran_char(side_acc[i]);
-                uplo_[i]       = *fortran_char(uplo_acc[i]);
-                diag_[i]       = *fortran_char(diag_acc[i]);
-                for (int64_t j = 0; j < group_size_acc[i]; j++) {
-                    if (idx == 0) {
-                        a_array[0] = a_acc.get_pointer();
-                        b_array[0] = b_acc.get_pointer();
-                    }
-                    else {
-                        a_array[idx] = a_array[idx - 1] + offset_a;
-                        b_array[idx] = b_array[idx - 1] + offset_b;
-                    }
-                    idx++;
-                    offset_a =
-                        (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-                    offset_b = ldb_acc[i] * n_acc[i];
-                }
-            }
-
-            ::ztrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(),
-                          (const MKL_Complex16 **)a_array, lda_, (MKL_Complex16 **)b_array, ldb_,
-                          (MKL_INT *)&group_count, group_size_);
-
-            ::free(a_array);
-            ::free(b_array);
-            ::free(m_);
-            ::free(n_);
-            ::free(lda_);
-            ::free(ldb_);
-            ::free(group_size_);
-            ::free(side_);
-            ::free(uplo_);
-            ::free(trans_);
-            ::free(diag_);
-        });
-    });
-}
-
 void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
                 diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
                 cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
@@ -1081,6 +390,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
                 (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size);
             MKL_Complex16 **b_array =
                 (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size);
+            if ((a_array == NULL) || (b_array == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_array);
+                ::free(b_array);
+                return;
+            }
 
             for (int64_t i = 0; i < batch_size; i++) {
                 if (i == 0) {
@@ -1104,5 +419,458 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans
     });
 }
 
+// USM APIs
+
+// USM group API, single precision: registers every event in `dependencies`, then runs (via
+// host_task) a lambda that maps each group's transpose values to chars with fortran_char and
+// calls ::sgemm_batch. Allocation failure is only printed. Returns the submit() event.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda,
+                           const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
+                           int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_usm_sgemm>(cgh, [=]() {
+            char *op_a = (char *)::malloc(sizeof(char) * group_count);
+            char *op_b = (char *)::malloc(sizeof(char) * group_count);
+            if ((op_a != NULL) && (op_b != NULL)) {
+                for (int64_t g = 0; g < group_count; ++g) {
+                    op_a[g] = *fortran_char(transa[g]);
+                    op_b[g] = *fortran_char(transb[g]);
+                }
+                ::sgemm_batch(op_a, op_b, (const MKL_INT *)m, (const MKL_INT *)n,
+                              (const MKL_INT *)k, alpha, a, (const MKL_INT *)lda, b,
+                              (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc,
+                              (const MKL_INT *)&group_count, (const MKL_INT *)group_size);
+            }
+            else {
+                std::cout << "Error cannot allocate trans arrays\n";
+            }
+            ::free(op_a);
+            ::free(op_b);
+        });
+    });
+}
+
+// USM group API, double precision: registers every event in `dependencies`, then runs (via
+// host_task) a lambda that maps each group's transpose values to chars with fortran_char and
+// calls ::dgemm_batch. Allocation failure is only printed. Returns the submit() event.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda,
+                           const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc,
+                           int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dgemm_batch_usm>(cgh, [=]() {
+            char *op_a = (char *)::malloc(sizeof(char) * group_count);
+            char *op_b = (char *)::malloc(sizeof(char) * group_count);
+            if ((op_a != NULL) && (op_b != NULL)) {
+                for (int64_t g = 0; g < group_count; ++g) {
+                    op_a[g] = *fortran_char(transa[g]);
+                    op_b[g] = *fortran_char(transb[g]);
+                }
+                ::dgemm_batch(op_a, op_b, (const MKL_INT *)m, (const MKL_INT *)n,
+                              (const MKL_INT *)k, alpha, a, (const MKL_INT *)lda, b,
+                              (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc,
+                              (const MKL_INT *)&group_count, (const MKL_INT *)group_size);
+            }
+            else {
+                std::cout << "Error cannot allocate trans arrays\n";
+            }
+            ::free(op_a);
+            ::free(op_b);
+        });
+    });
+}
+
+// USM group API, single-precision complex: registers every event in `dependencies`, then
+// runs (via host_task) a lambda that maps each group's transpose values to chars with
+// fortran_char and calls ::cgemm_batch with the caller's arrays. Allocation failure of the
+// char arrays is only printed to stdout. Returns the event produced by queue.submit.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, std::complex<float> *alpha,
+                           const std::complex<float> **a, int64_t *lda,
+                           const std::complex<float> **b, int64_t *ldb, std::complex<float> *beta,
+                           std::complex<float> **c, int64_t *ldc, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_cgemm_batch_usm>(cgh, [=]() {
+            char *op_a = (char *)::malloc(sizeof(char) * group_count);
+            char *op_b = (char *)::malloc(sizeof(char) * group_count);
+            if ((op_a != NULL) && (op_b != NULL)) {
+                for (int64_t g = 0; g < group_count; ++g) {
+                    op_a[g] = *fortran_char(transa[g]);
+                    op_b[g] = *fortran_char(transb[g]);
+                }
+                ::cgemm_batch(op_a, op_b, (const MKL_INT *)m, (const MKL_INT *)n,
+                              (const MKL_INT *)k, alpha, a, (const MKL_INT *)lda, b,
+                              (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc,
+                              (const MKL_INT *)&group_count, (const MKL_INT *)group_size);
+            }
+            else {
+                std::cout << "Error cannot allocate trans arrays\n";
+            }
+            ::free(op_a);
+            ::free(op_b);
+        });
+    });
+}
+
+// USM group API, double-precision complex: registers every event in `dependencies`, then
+// runs (via host_task) a lambda that maps each group's transpose values to chars with
+// fortran_char and calls ::zgemm_batch with the caller's arrays. Allocation failure of the
+// char arrays is only printed to stdout. Returns the event produced by queue.submit.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
+                           int64_t *n, int64_t *k, std::complex<double> *alpha,
+                           const std::complex<double> **a, int64_t *lda,
+                           const std::complex<double> **b, int64_t *ldb, std::complex<double> *beta,
+                           std::complex<double> **c, int64_t *ldc, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zgemm_batch_usm>(cgh, [=]() {
+            char *op_a = (char *)::malloc(sizeof(char) * group_count);
+            char *op_b = (char *)::malloc(sizeof(char) * group_count);
+            if ((op_a != NULL) && (op_b != NULL)) {
+                for (int64_t g = 0; g < group_count; ++g) {
+                    op_a[g] = *fortran_char(transa[g]);
+                    op_b[g] = *fortran_char(transb[g]);
+                }
+                ::zgemm_batch(op_a, op_b, (const MKL_INT *)m, (const MKL_INT *)n,
+                              (const MKL_INT *)k, alpha, a, (const MKL_INT *)lda, b,
+                              (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc,
+                              (const MKL_INT *)&group_count, (const MKL_INT *)group_size);
+            }
+            else {
+                std::cout << "Error cannot allocate trans arrays\n";
+            }
+            ::free(op_a);
+            ::free(op_b);
+        });
+    });
+}
+
+// USM strided API, single precision: registers every event in `dependencies`, then runs (via
+// host_task) a lambda that builds pointer tables for the batch, where entry i is entry i - 1
+// advanced by stride_a / stride_b / stride_c elements, and passes them to ::sgemm_batch as a
+// single group of batch_size products. Allocation failure is only printed to stdout.
+// Returns the event produced by queue.submit.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                           int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b,
+                           float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+
+        // Computed outside the lambda so that it captures plain values by copy.
+        const char op_a   = *fortran_char(transa);
+        const char op_b   = *fortran_char(transb);
+        MKL_INT one_group = 1;
+        host_task<class mkl_kernel_sgemm_batch_usm>(cgh, [=]() {
+            float **a_ptrs = (float **)::malloc(sizeof(float *) * batch_size);
+            float **b_ptrs = (float **)::malloc(sizeof(float *) * batch_size);
+            float **c_ptrs = (float **)::malloc(sizeof(float *) * batch_size);
+            if ((a_ptrs != NULL) && (b_ptrs != NULL) && (c_ptrs != NULL)) {
+                // Entry 0 is the caller's base pointer; later entries add the stride.
+                for (int64_t i = 0; i < batch_size; ++i) {
+                    a_ptrs[i] = (i == 0) ? (float *)a : a_ptrs[i - 1] + stride_a;
+                    b_ptrs[i] = (i == 0) ? (float *)b : b_ptrs[i - 1] + stride_b;
+                    c_ptrs[i] = (i == 0) ? (float *)c : c_ptrs[i - 1] + stride_c;
+                }
+
+                // Scalars are passed by address; the one group has batch_size members.
+                ::sgemm_batch(&op_a, &op_b, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                              (const MKL_INT *)&k, &alpha, (const float **)a_ptrs,
+                              (const MKL_INT *)&lda, (const float **)b_ptrs, (const MKL_INT *)&ldb,
+                              &beta, c_ptrs, (const MKL_INT *)&ldc, (const MKL_INT *)&one_group,
+                              (const MKL_INT *)&batch_size);
+            }
+            else {
+                std::cout << "Error cannot allocate input arrays\n";
+            }
+
+            // free(NULL) is a no-op, so this also covers a partially failed allocation.
+            ::free(a_ptrs);
+            ::free(b_ptrs);
+            ::free(c_ptrs);
+        });
+    });
+}
+
+// Strided-batch DGEMM on USM pointers. Problem i uses a + i * stride_a, b + i * stride_b and
+// c + i * stride_c; the batch reaches ::dgemm_batch as one group of batch_size problems that
+// share m, n, k, alpha, beta and the leading dimensions. Each event in `dependencies` is
+// registered with the handler first. Returns the event produced by queue.submit().
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                           int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b,
+                           double beta, double *c, int64_t ldc, int64_t stride_c,
+                           int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char transa_ = *fortran_char(transa);
+        const char transb_ = *fortran_char(transb);
+        const MKL_INT single_group = 1; // group_count: the strided batch forms exactly one group
+        host_task<class mkl_kernel_dgemm_batch_usm>(cgh, [=]() {
+            // An empty batch needs no pointer tables. Returning here also avoids reporting a
+            // bogus allocation failure on platforms where malloc(0) yields a null pointer.
+            if (batch_size == 0) {
+                return;
+            }
+            double **a_ptrs = (double **)::malloc(sizeof(double *) * batch_size);
+            double **b_ptrs = (double **)::malloc(sizeof(double *) * batch_size);
+            double **c_ptrs = (double **)::malloc(sizeof(double *) * batch_size);
+            if ((a_ptrs == NULL) || (b_ptrs == NULL) || (c_ptrs == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_ptrs);
+                ::free(b_ptrs);
+                ::free(c_ptrs);
+                return;
+            }
+            // Expand the base pointers and strides into the per-problem pointer tables.
+            for (int64_t i = 0; i < batch_size; i++) {
+                a_ptrs[i] = (double *)a + i * stride_a;
+                b_ptrs[i] = (double *)b + i * stride_b;
+                c_ptrs[i] = c + i * stride_c;
+            }
+            ::dgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                          (const MKL_INT *)&k, &alpha, (const double **)a_ptrs,
+                          (const MKL_INT *)&lda, (const double **)b_ptrs, (const MKL_INT *)&ldb,
+                          &beta, c_ptrs, (const MKL_INT *)&ldc, &single_group,
+                          (const MKL_INT *)&batch_size);
+            ::free(a_ptrs);
+            ::free(b_ptrs);
+            ::free(c_ptrs);
+        });
+    });
+}
+
+// Strided-batch CGEMM on USM pointers. Problem i uses a + i * stride_a, b + i * stride_b and
+// c + i * stride_c; the batch reaches ::cgemm_batch as one group of batch_size problems that
+// share m, n, k, alpha, beta and the leading dimensions. Each event in `dependencies` is
+// registered with the handler first. Returns the event produced by queue.submit().
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, int64_t stride_a,
+                           const std::complex<float> *b, int64_t ldb, int64_t stride_b,
+                           std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                           int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char transa_ = *fortran_char(transa);
+        const char transb_ = *fortran_char(transb);
+        const MKL_INT single_group = 1; // group_count: the strided batch forms exactly one group
+        host_task<class mkl_kernel_cgemm_batch_usm>(cgh, [=]() {
+            // An empty batch needs no pointer tables. Returning here also avoids reporting a
+            // bogus allocation failure on platforms where malloc(0) yields a null pointer.
+            if (batch_size == 0) {
+                return;
+            }
+            std::complex<float> **a_ptrs =
+                (std::complex<float> **)::malloc(sizeof(std::complex<float> *) * batch_size);
+            std::complex<float> **b_ptrs =
+                (std::complex<float> **)::malloc(sizeof(std::complex<float> *) * batch_size);
+            std::complex<float> **c_ptrs =
+                (std::complex<float> **)::malloc(sizeof(std::complex<float> *) * batch_size);
+            if ((a_ptrs == NULL) || (b_ptrs == NULL) || (c_ptrs == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_ptrs);
+                ::free(b_ptrs);
+                ::free(c_ptrs);
+                return;
+            }
+            // Expand the base pointers and strides into the per-problem pointer tables.
+            for (int64_t i = 0; i < batch_size; i++) {
+                a_ptrs[i] = (std::complex<float> *)a + i * stride_a;
+                b_ptrs[i] = (std::complex<float> *)b + i * stride_b;
+                c_ptrs[i] = c + i * stride_c;
+            }
+            ::cgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                          (const MKL_INT *)&k, &alpha, (const std::complex<float> **)a_ptrs,
+                          (const MKL_INT *)&lda, (const std::complex<float> **)b_ptrs,
+                          (const MKL_INT *)&ldb, &beta, c_ptrs, (const MKL_INT *)&ldc,
+                          &single_group, (const MKL_INT *)&batch_size);
+            ::free(a_ptrs);
+            ::free(b_ptrs);
+            ::free(c_ptrs);
+        });
+    });
+}
+
+// Strided-batch ZGEMM on USM pointers. Problem i uses a + i * stride_a, b + i * stride_b and
+// c + i * stride_c; the batch reaches ::zgemm_batch as one group of batch_size problems that
+// share m, n, k, alpha, beta and the leading dimensions. Each event in `dependencies` is
+// registered with the handler first. Returns the event produced by queue.submit().
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                           int64_t n, int64_t k, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda, int64_t stride_a,
+                           const std::complex<double> *b, int64_t ldb, int64_t stride_b,
+                           std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                           int64_t stride_c, int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char transa_ = *fortran_char(transa);
+        const char transb_ = *fortran_char(transb);
+        const MKL_INT single_group = 1; // group_count: the strided batch forms exactly one group
+        host_task<class mkl_kernel_zgemm_batch_usm>(cgh, [=]() {
+            // An empty batch needs no pointer tables. Returning here also avoids reporting a
+            // bogus allocation failure on platforms where malloc(0) yields a null pointer.
+            if (batch_size == 0) {
+                return;
+            }
+            std::complex<double> **a_ptrs =
+                (std::complex<double> **)::malloc(sizeof(std::complex<double> *) * batch_size);
+            std::complex<double> **b_ptrs =
+                (std::complex<double> **)::malloc(sizeof(std::complex<double> *) * batch_size);
+            std::complex<double> **c_ptrs =
+                (std::complex<double> **)::malloc(sizeof(std::complex<double> *) * batch_size);
+            if ((a_ptrs == NULL) || (b_ptrs == NULL) || (c_ptrs == NULL)) {
+                std::cout << "Error cannot allocate input arrays\n";
+                ::free(a_ptrs);
+                ::free(b_ptrs);
+                ::free(c_ptrs);
+                return;
+            }
+            // Expand the base pointers and strides into the per-problem pointer tables.
+            for (int64_t i = 0; i < batch_size; i++) {
+                a_ptrs[i] = (std::complex<double> *)a + i * stride_a;
+                b_ptrs[i] = (std::complex<double> *)b + i * stride_b;
+                c_ptrs[i] = c + i * stride_c;
+            }
+            ::zgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                          (const MKL_INT *)&k, &alpha, (const std::complex<double> **)a_ptrs,
+                          (const MKL_INT *)&lda, (const std::complex<double> **)b_ptrs,
+                          (const MKL_INT *)&ldb, &beta, c_ptrs, (const MKL_INT *)&ldc,
+                          &single_group, (const MKL_INT *)&batch_size);
+            ::free(a_ptrs);
+            ::free(b_ptrs);
+            ::free(c_ptrs);
+        });
+    });
+}
+
+// Group-batched SAXPY on USM pointers: for every group g, each of its group_size[g] vector
+// pairs is passed to ::saxpy with that group's n[g], alpha[g], incx[g] and incy[g]. The x and y
+// pointer arrays are consumed group after group. Returns the event from queue.submit().
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, float *alpha, const float **x,
+                           int64_t *incx, float **y, int64_t *incy, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_saxpy_batch_usm>(cgh, [=]() {
+            int64_t first = 0; // index in x/y of the current group's first vector
+            for (int64_t g = 0; g < group_count; ++g) {
+                for (int64_t v = first; v < first + group_size[g]; ++v) {
+                    ::saxpy((const MKL_INT *)&n[g], &alpha[g], x[v], (const MKL_INT *)&incx[g],
+                            y[v], (const MKL_INT *)&incy[g]);
+                }
+                first += group_size[g];
+            }
+        });
+    });
+}
+
+// Group-batched DAXPY on USM pointers: for every group g, each of its group_size[g] vector
+// pairs is passed to ::daxpy with that group's n[g], alpha[g], incx[g] and incy[g]. The x and y
+// pointer arrays are consumed group after group. Returns the event from queue.submit().
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, double *alpha, const double **x,
+                           int64_t *incx, double **y, int64_t *incy, int64_t group_count,
+                           int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_daxpy_batch_usm>(cgh, [=]() {
+            int64_t first = 0; // index in x/y of the current group's first vector
+            for (int64_t g = 0; g < group_count; ++g) {
+                for (int64_t v = first; v < first + group_size[g]; ++v) {
+                    ::daxpy((const MKL_INT *)&n[g], &alpha[g], x[v], (const MKL_INT *)&incx[g],
+                            y[v], (const MKL_INT *)&incy[g]);
+                }
+                first += group_size[g];
+            }
+        });
+    });
+}
+
+// Group-batched CAXPY on USM pointers: each of the group_size[g] vector pairs of group g is
+// passed to ::caxpy with n[g], alpha[g], incx[g] and incy[g]; x and y are consumed in order.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
+                           const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
+                           int64_t *incy, int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_caxpy_batch_usm>(cgh, [=]() {
+            int64_t offset = 0;
+            for (int64_t i = 0; i < group_count; i++) {
+                // alpha is per group, so convert it once here rather than once per vector.
+                const MKL_Complex8 alpha_ = { alpha[i].real(), alpha[i].imag() };
+                for (int64_t j = 0; j < group_size[i]; j++) {
+                    ::caxpy((const MKL_INT *)&n[i], &alpha_, x[offset + j],
+                            (const MKL_INT *)&incx[i], y[offset + j], (const MKL_INT *)&incy[i]);
+                }
+                offset += group_size[i];
+            }
+        });
+    });
+}
+
+// Group-batched ZAXPY on USM pointers: each of the group_size[g] vector pairs of group g is
+// passed to ::zaxpy with n[g], alpha[g], incx[g] and incy[g]; x and y are consumed in order.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
+                           const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
+                           int64_t *incy, int64_t group_count, int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zaxpy_batch_usm>(cgh, [=]() {
+            int64_t offset = 0;
+            for (int64_t i = 0; i < group_count; i++) {
+                // alpha is per group, so convert it once here rather than once per vector.
+                const MKL_Complex16 alpha_ = { alpha[i].real(), alpha[i].imag() };
+                for (int64_t j = 0; j < group_size[i]; j++) {
+                    ::zaxpy((const MKL_INT *)&n[i], &alpha_, x[offset + j],
+                            (const MKL_INT *)&incx[i], y[offset + j], (const MKL_INT *)&incy[i]);
+                }
+                offset += group_size[i];
+            }
+        });
+    });
+}
+
 } // namespace mklcpu
 } // namespace onemkl
diff --git a/src/blas/backends/mklcpu/cpu_extensions.cpp b/src/blas/backends/mklcpu/cpu_extensions.cpp
index dc0d557bb..378b8abc8 100644
--- a/src/blas/backends/mklcpu/cpu_extensions.cpp
+++ b/src/blas/backends/mklcpu/cpu_extensions.cpp
@@ -79,6 +79,8 @@ static inline void copy_mat(T_src &src, int64_t row, int64_t col, int64_t ld, of
     }
 }
 
+// Buffer APIs
+
 void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
           int64_t k, half alpha, cl::sycl::buffer<half, 1> &a, int64_t lda,
           cl::sycl::buffer<half, 1> &b, int64_t ldb, half beta, cl::sycl::buffer<half, 1> &c,
@@ -309,5 +311,105 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose
     });
 }
 
+// USM APIs
+
+// USM SGEMMT: after each event in `dependencies` is registered, host_task runs ::sgemmt on the
+// raw pointers a, b and c; the scalars and Fortran option characters are captured by value.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                      const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        const char ta_ch   = *fortran_char(transa);
+        const char tb_ch   = *fortran_char(transb);
+        host_task<class mkl_kernel_sgemmt_usm>(cgh, [=]() {
+            ::sgemmt(&uplo_ch, &ta_ch, &tb_ch,
+                     (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha,
+                     a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta,
+                     c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM DGEMMT: after each event in `dependencies` is registered, host_task runs ::dgemmt on the
+// raw pointers a, b and c; the scalars and Fortran option characters are captured by value.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                      const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        const char ta_ch   = *fortran_char(transa);
+        const char tb_ch   = *fortran_char(transb);
+        host_task<class mkl_kernel_dgemmt_usm>(cgh, [=]() {
+            ::dgemmt(&uplo_ch, &ta_ch, &tb_ch,
+                     (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha,
+                     a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta,
+                     c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM CGEMMT: after each event in `dependencies` is registered, host_task runs ::cgemmt on the
+// raw pointers a, b and c. Returns the event produced by queue.submit().
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      int64_t lda, const std::complex<float> *b, int64_t ldb,
+                      std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        const char ta_ch   = *fortran_char(transa);
+        const char tb_ch   = *fortran_char(transb);
+        // The complex scalars are split into floats here; the task lambda captures those copies.
+        const float re_alpha = alpha.real(), im_alpha = alpha.imag();
+        const float re_beta = beta.real(), im_beta = beta.imag();
+        host_task<class mkl_kernel_cgemmt_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_ = { re_alpha, im_alpha };
+            const MKL_Complex8 beta_  = { re_beta, im_beta };
+            ::cgemmt(&uplo_ch, &ta_ch, &tb_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_,
+                     a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM ZGEMMT: after each event in `dependencies` is registered, host_task runs ::zgemmt on the
+// raw pointers a, b and c. Returns the event produced by queue.submit().
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      int64_t n, int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
+                      int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        const char ta_ch   = *fortran_char(transa);
+        const char tb_ch   = *fortran_char(transb);
+        // The complex scalars are split into doubles here; the task lambda captures those copies.
+        const double re_alpha = alpha.real(), im_alpha = alpha.imag();
+        const double re_beta = beta.real(), im_beta = beta.imag();
+        host_task<class mkl_kernel_zgemmt_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_ = { re_alpha, im_alpha };
+            const MKL_Complex16 beta_  = { re_beta, im_beta };
+            ::zgemmt(&uplo_ch, &ta_ch, &tb_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_,
+                     a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
 } // namespace mklcpu
 } // namespace onemkl
diff --git a/src/blas/backends/mklcpu/cpu_level1.cpp b/src/blas/backends/mklcpu/cpu_level1.cpp
index a20f6d2ec..205e6601d 100644
--- a/src/blas/backends/mklcpu/cpu_level1.cpp
+++ b/src/blas/backends/mklcpu/cpu_level1.cpp
@@ -25,6 +25,8 @@
 namespace onemkl {
 namespace mklcpu {
 
+// Buffer APIs
+
 void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
           cl::sycl::buffer<float, 1> &result) {
     queue.submit([&](cl::sycl::handler &cgh) {
@@ -712,5 +714,823 @@ void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<doubl
     });
 }
 
+// USM APIs
+
+// USM SASUM: registers each event in `dependencies` with the handler, then uses host_task to
+// store the value returned by ::sasum(n, x, incx) in result[0]. Returns the submission event.
+cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_sasum_usm>(cgh, [=]() {
+            *result = ::sasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// USM DASUM: registers each event in `dependencies` with the handler, then uses host_task to
+// store the value returned by ::dasum(n, x, incx) in result[0]. Returns the submission event.
+cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dasum_usm>(cgh, [=]() {
+            *result = ::dasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// USM SCASUM (complex input, float result): registers each event in `dependencies`, then uses
+// host_task to store the value returned by ::scasum(n, x, incx) in result[0].
+cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                     float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_scasum_usm>(cgh, [=]() {
+            *result = ::scasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// USM DZASUM (complex input, double result): registers each event in `dependencies`, then uses
+// host_task to store the value returned by ::dzasum(n, x, incx) in result[0].
+cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dzasum_usm>(cgh, [=]() {
+            *result = ::dzasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// USM SAXPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::saxpy with n, alpha, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
+                     float *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_saxpy_usm>(cgh, [=]() {
+            ::saxpy((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM DAXPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::daxpy with n, alpha, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
+                     double *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_daxpy_usm>(cgh, [=]() {
+            ::daxpy((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM CAXPY: alpha is split into real/imaginary floats outside the task and rebuilt as an
+// MKL_Complex8 inside it before ::caxpy is called. Returns the event from queue.submit().
+cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, int64_t incx, std::complex<float> *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const float re_alpha = alpha.real(), im_alpha = alpha.imag();
+        host_task<class mkl_kernel_caxpy_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_ = { re_alpha, im_alpha };
+            ::caxpy((const MKL_INT *)&n, &alpha_, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM ZAXPY: alpha is split into real/imaginary doubles outside the task and rebuilt as an
+// MKL_Complex16 inside it before ::zaxpy is called. Returns the event from queue.submit().
+cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, int64_t incx, std::complex<double> *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const double re_alpha = alpha.real(), im_alpha = alpha.imag();
+        host_task<class mkl_kernel_zaxpy_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_ = { re_alpha, im_alpha };
+            ::zaxpy((const MKL_INT *)&n, &alpha_, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM SCOPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::scopy with n, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_scopy_usm>(cgh, [=]() {
+            ::scopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM DCOPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::dcopy with n, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dcopy_usm>(cgh, [=]() {
+            ::dcopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM CCOPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::ccopy with n, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                     std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_ccopy_usm>(cgh, [=]() {
+            ::ccopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM ZCOPY: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::zcopy with n, x, incx, y and incy. Returns the event from queue.submit().
+cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zcopy_usm>(cgh, [=]() {
+            ::zcopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM SDOT: registers each event in `dependencies` with the handler, then uses host_task to
+// store the value returned by ::sdot(n, x, incx, y, incy) in result[0].
+cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
+                    int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_sdot_usm>(cgh, [=]() {
+            *result = ::sdot((const MKL_INT *)&n, x, (const MKL_INT *)&incx,
+                             y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM DDOT: registers each event in `dependencies` with the handler, then uses host_task to
+// store the value returned by ::ddot(n, x, incx, y, incy) in result[0].
+cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx,
+                    const double *y, int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_ddot_usm>(cgh, [=]() {
+            *result = ::ddot((const MKL_INT *)&n, x, (const MKL_INT *)&incx,
+                             y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM DSDOT (float inputs, double result): registers each event in `dependencies`, then uses
+// host_task to store the value returned by ::dsdot(n, x, incx, y, incy) in result[0].
+cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
+                    int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dsdot_usm>(cgh, [=]() {
+            *result = ::dsdot((const MKL_INT *)&n, x, (const MKL_INT *)&incx,
+                              y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM CDOTC: registers each event in `dependencies` with the handler, then uses host_task to
+// call ::cdotc, which receives `result` as its first argument. Returns the submission event.
+cl::sycl::event dotc(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                     const std::complex<float> *y, int64_t incy, std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_cdotc_usm>(cgh, [=]() {
+            ::cdotc(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// dotc (complex<double>): host task calls ::zdotc, handing it `result` as the first argument.
+cl::sycl::event dotc(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
+                     const std::complex<double> *y, int64_t incy, std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zdotc_usm>(cgh, [=]() {
+            ::zdotc(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// dotu (complex<float>): host task calls ::cdotu, handing it `result` as the first argument.
+cl::sycl::event dotu(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                     const std::complex<float> *y, int64_t incy, std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_cdotu_usm>(cgh, [=]() {
+            ::cdotu(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// dotu (complex<double>): host task calls ::zdotu, handing it `result` as the first argument.
+cl::sycl::event dotu(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
+                     const std::complex<double> *y, int64_t incy, std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zdotu_usm>(cgh, [=]() {
+            ::zdotu(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// iamin (float): host task stores the ::cblas_isamin return value in *result.
+cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_isamin_usm>(cgh, [=]() {
+            *result = ::cblas_isamin(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamin (double): host task stores the ::cblas_idamin return value in *result.
+cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_idamin_usm>(cgh, [=]() {
+            *result = ::cblas_idamin(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamin (complex<float>): host task stores the ::cblas_icamin return value in *result.
+cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_icamin_usm>(cgh, [=]() {
+            *result = ::cblas_icamin(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamin (complex<double>): host task stores the ::cblas_izamin return value in *result.
+cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x,
+                      int64_t incx, int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_izamin_usm>(cgh, [=]() {
+            *result = ::cblas_izamin(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamax (float): host task stores the ::cblas_isamax return value in *result.
+cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_isamax_usm>(cgh, [=]() {
+            *result = ::cblas_isamax(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamax (double): host task stores the ::cblas_idamax return value in *result.
+cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_idamax_usm>(cgh, [=]() {
+            *result = ::cblas_idamax(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamax (complex<float>): host task stores the ::cblas_icamax return value in *result.
+cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                      int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_icamax_usm>(cgh, [=]() {
+            *result = ::cblas_icamax(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// iamax (complex<double>): host task stores the ::cblas_izamax return value in *result.
+cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x,
+                      int64_t incx, int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_izamax_usm>(cgh, [=]() {
+            *result = ::cblas_izamax(static_cast<MKL_INT>(n), x, static_cast<MKL_INT>(incx));
+        });
+    });
+}
+
+// nrm2 (float): host task stores the ::snrm2 value in *result; returns the queue.submit event.
+cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_snrm2_usm>(cgh, [=]() {
+            *result = ::snrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// nrm2 (double): host task stores the ::dnrm2 value in *result; returns the queue.submit event.
+cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dnrm2_usm>(cgh, [=]() {
+            *result = ::dnrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// nrm2 (complex<float> input, float result): host task stores the ::scnrm2 value in *result.
+cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
+                     float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_scnrm2_usm>(cgh, [=]() {
+            *result = ::scnrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// nrm2 (complex<double> input, double result): host task stores the ::dznrm2 value in *result.
+cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dznrm2_usm>(cgh, [=]() {
+            *result = ::dznrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// rot (float): host task calls ::srot on x and y with c and s; returns the queue.submit event.
+cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y,
+                    int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_srot_usm>(cgh, [=]() {
+            ::srot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c,
+                   &s);
+        });
+    });
+}
+
+// rot (double): host task calls ::drot on x and y with c and s; returns the queue.submit event.
+cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y,
+                    int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_drot_usm>(cgh, [=]() {
+            ::drot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c,
+                   &s);
+        });
+    });
+}
+
+// rot (complex<float> vectors, float c and s): host task calls ::csrot on x and y.
+cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
+                    std::complex<float> *y, int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_csrot_usm>(cgh, [=]() {
+            ::csrot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c,
+                    &s);
+        });
+    });
+}
+
+// rot (complex<double> vectors, double c and s): host task calls ::zdrot on x and y.
+cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
+                    std::complex<double> *y, int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zdrot_usm>(cgh, [=]() {
+            ::zdrot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c,
+                    &s);
+        });
+    });
+}
+
+// rotg (float): host task passes a, b, c and s to ::srotg; returns the queue.submit event.
+cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_srotg_usm>(cgh, [=]() {
+            ::srotg(a, b, c, s);
+        });
+    });
+}
+
+// rotg (double): host task passes a, b, c and s to ::drotg; returns the queue.submit event.
+cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_drotg_usm>(cgh, [=]() {
+            ::drotg(a, b, c, s);
+        });
+    });
+}
+
+// rotg (complex<float> a, b, s; float c): host task passes the four pointers to ::crotg.
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
+                     float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_crotg_usm>(cgh, [=]() {
+            ::crotg(a, b, c, s);
+        });
+    });
+}
+
+// rotg (complex<double> a, b, s; double c): host task passes the four pointers to ::zrotg.
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+                     double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zrotg_usm>(cgh, [=]() {
+            ::zrotg(a, b, c, s);
+        });
+    });
+}
+
+// rotm (float): host task calls ::srotm on x and y with `param`; returns the queue.submit event.
+cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y,
+                     int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_srotm_usm>(cgh, [=]() {
+            ::srotm((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy,
+                    param);
+        });
+    });
+}
+
+// rotm (double): host task calls ::drotm on x and y with `param`; returns the queue.submit event.
+cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y,
+                     int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_drotm_usm>(cgh, [=]() {
+            ::drotm((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy,
+                    param);
+        });
+    });
+}
+
+// rotmg (float): host task passes d1, d2, x1, the address of the y1 copy, and param to ::srotmg.
+cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                      float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_srotmg_usm>(cgh, [=]() {
+            ::srotmg(d1, d2, x1, (float *)&y1, param); // cast strips const from the by-copy capture
+        });
+    });
+}
+
+// rotmg (double): host task passes d1, d2, x1, the address of the y1 copy, and param to ::drotmg.
+cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+                      double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_drotmg_usm>(cgh, [=]() {
+            ::drotmg(d1, d2, x1, (double *)&y1, param); // cast strips const from the by-copy capture
+        });
+    });
+}
+
+// scal (float): host task calls ::sscal on x with alpha; returns the queue.submit event.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, float alpha, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_sscal_usm>(cgh, [=]() {
+            ::sscal((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// scal (double): host task calls ::dscal on x with alpha; returns the queue.submit event.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dscal_usm>(cgh, [=]() {
+            ::dscal((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// scal (complex<float>): host task rebuilds alpha as MKL_Complex8 and calls ::cscal on x.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, std::complex<float> alpha,
+                     std::complex<float> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_cscal_usm>(cgh, [=]() {
+            MKL_Complex8 calpha = { alpha_re, alpha_im };
+            ::cscal((const MKL_INT *)&n, &calpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// scal (float alpha, complex<float> x): host task calls ::csscal on x with alpha.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, float alpha, std::complex<float> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_csscal_usm>(cgh, [=]() {
+            ::csscal((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// scal (complex<double>): host task rebuilds alpha as MKL_Complex16 and calls ::zscal on x.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, std::complex<double> alpha,
+                     std::complex<double> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_zscal_usm>(cgh, [=]() {
+            MKL_Complex16 calpha = { alpha_re, alpha_im };
+            ::zscal((const MKL_INT *)&n, &calpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// scal (double alpha, complex<double> x): host task calls ::zdscal on x with alpha.
+cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, double alpha, std::complex<double> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zdscal_usm>(cgh, [=]() {
+            ::zdscal((const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+// sdsdot: host task stores the ::sdsdot value (called with sb, x and y) in *result.
+cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
+                       const float *y, int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_sdsdot_usm>(cgh, [=]() {
+            *result = ::sdsdot((const MKL_INT *)&n, &sb, x, (const MKL_INT *)&incx,
+                               y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// swap (float): host task calls ::sswap on x and y; returns the queue.submit event.
+cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_sswap_usm>(cgh, [=]() {
+            ::sswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// swap (double): host task calls ::dswap on x and y; returns the queue.submit event.
+cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_dswap_usm>(cgh, [=]() {
+            ::dswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// swap (complex<float>): host task calls ::cswap on x and y; returns the queue.submit event.
+cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
+                     std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_cswap_usm>(cgh, [=]() {
+            ::cswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// swap (complex<double>): host task calls ::zswap on x and y; returns the queue.submit event.
+cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        host_task<class mkl_kernel_zswap_usm>(cgh, [=]() {
+            ::zswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
 } // namespace mklcpu
 } // namespace onemkl
diff --git a/src/blas/backends/mklcpu/cpu_level2.cpp b/src/blas/backends/mklcpu/cpu_level2.cpp
index 0a1197c4e..dc81936b5 100644
--- a/src/blas/backends/mklcpu/cpu_level2.cpp
+++ b/src/blas/backends/mklcpu/cpu_level2.cpp
@@ -25,6 +25,8 @@
 namespace onemkl {
 namespace mklcpu {
 
+// Buffer APIs
+
 void gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
           float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x,
           int64_t incx, float beta, cl::sycl::buffer<float, 1> &y, int64_t incy) {
@@ -1175,5 +1177,1302 @@ void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_d
     });
 }
 
+// USM APIs
+
+// gbmv (float): host task forwards the arguments to ::sgbmv; returns the queue.submit event.
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
+                     int64_t ku, float alpha, const float *a, int64_t lda, const float *x,
+                     int64_t incx, float beta, float *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        host_task<class mkl_kernel_sgbmv_usm>(cgh, [=]() {
+            ::sgbmv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    (const MKL_INT *)&kl, (const MKL_INT *)&ku, &alpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &beta, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// gbmv (double): host task forwards the arguments to ::dgbmv; returns the queue.submit event.
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
+                     int64_t ku, double alpha, const double *a, int64_t lda, const double *x,
+                     int64_t incx, double beta, double *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        host_task<class mkl_kernel_dgbmv_usm>(cgh, [=]() {
+            ::dgbmv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    (const MKL_INT *)&kl, (const MKL_INT *)&ku, &alpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &beta, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// gbmv (complex<float>): host task rebuilds alpha/beta as MKL_Complex8 and calls ::cgbmv.
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
+                     int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
+                     int64_t lda, const std::complex<float> *x, int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_cgbmv_usm>(cgh, [=]() {
+            MKL_Complex8 calpha = { alpha_re, alpha_im };
+            MKL_Complex8 cbeta  = { beta_re, beta_im };
+            ::cgbmv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    (const MKL_INT *)&kl, (const MKL_INT *)&ku, &calpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &cbeta,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// gbmv (complex<double>): host task rebuilds alpha/beta as MKL_Complex16 and calls ::zgbmv.
+cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
+                     int64_t ku, std::complex<double> alpha, const std::complex<double> *a,
+                     int64_t lda, const std::complex<double> *x, int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zgbmv_usm>(cgh, [=]() {
+            MKL_Complex16 calpha = { alpha_re, alpha_im };
+            MKL_Complex16 cbeta  = { beta_re, beta_im };
+            ::zgbmv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    (const MKL_INT *)&kl, (const MKL_INT *)&ku, &calpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &cbeta,
+                    y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// gemv (float): host task forwards the arguments to ::sgemv; returns the queue.submit event.
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
+                     const float *a, int64_t lda, const float *x, int64_t incx, float beta,
+                     float *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        host_task<class mkl_kernel_sgemv_usm>(cgh, [=]() {
+            ::sgemv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    &alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// gemv (double): host task forwards the arguments to ::dgemv; returns the queue.submit event.
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
+                     const double *a, int64_t lda, const double *x, int64_t incx, double beta,
+                     double *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char op_chr = *fortran_char(trans);
+        host_task<class mkl_kernel_dgemv_usm>(cgh, [=]() {
+            ::dgemv(&op_chr, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    &alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM gemv (complex<float>): queues a host_task that runs ::cgemv after `dependencies`.
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                     const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char trans_c = *fortran_char(trans);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const float beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_cgemv_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex8 mkl_beta = { beta_re, beta_im };
+            ::cgemv(&trans_c, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM gemv (complex<double>): queues a host_task that runs ::zgemv after `dependencies`.
+cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                     const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char trans_c = *fortran_char(trans);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const double beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zgemv_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex16 mkl_beta = { beta_re, beta_im };
+            ::zgemv(&trans_c, (const MKL_INT *)&m, (const MKL_INT *)&n,
+                    &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM ger (float): queues a host_task that runs ::sger after `dependencies`.
+cl::sycl::event ger(cl::sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x,
+                    int64_t incx, const float *y, int64_t incy, float *a, int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // [=] copies the scalars into the task, so the addresses passed to ::sger stay valid.
+        host_task<class mkl_kernel_sger_usm>(cgh, [=]() {
+            ::sger((const MKL_INT *)&m, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                   y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM ger (double): queues a host_task that runs ::dger after `dependencies`.
+cl::sycl::event ger(cl::sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x,
+                    int64_t incx, const double *y, int64_t incy, double *a, int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // [=] copies the scalars into the task, so the addresses passed to ::dger stay valid.
+        host_task<class mkl_kernel_dger_usm>(cgh, [=]() {
+            ::dger((const MKL_INT *)&m, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                   y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM gerc (complex<float>): queues a host_task that runs ::cgerc after `dependencies`.
+cl::sycl::event gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                     int64_t incy, std::complex<float> *a, int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_cgerc_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            ::cgerc((const MKL_INT *)&m, (const MKL_INT *)&n, &mkl_alpha, x,
+                    (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM gerc (complex<double>): queues a host_task that runs ::zgerc after `dependencies`.
+cl::sycl::event gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
+                     int64_t incy, std::complex<double> *a, int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zgerc_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            ::zgerc((const MKL_INT *)&m, (const MKL_INT *)&n, &mkl_alpha, x,
+                    (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM geru (complex<float>): queues a host_task that runs ::cgeru after `dependencies`.
+cl::sycl::event geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                     int64_t incy, std::complex<float> *a, int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_cgeru_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            ::cgeru((const MKL_INT *)&m, (const MKL_INT *)&n, &mkl_alpha, x,
+                    (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM geru (complex<double>): queues a host_task that runs ::zgeru after `dependencies`.
+cl::sycl::event geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
+                     int64_t incy, std::complex<double> *a, int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zgeru_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            ::zgeru((const MKL_INT *)&m, (const MKL_INT *)&n, &mkl_alpha, x,
+                    (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM hbmv (complex<float>): queues a host_task that runs ::chbmv after `dependencies`.
+cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
+                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                     const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const float beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_chbmv_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex8 mkl_beta = { beta_re, beta_im };
+            ::chbmv(&uplo_c, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM hbmv (complex<double>): queues a host_task that runs ::zhbmv after `dependencies`.
+cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
+                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                     const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const double beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zhbmv_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex16 mkl_beta = { beta_re, beta_im };
+            ::zhbmv(&uplo_c, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM hemv (complex<float>): queues a host_task that runs ::chemv after `dependencies`.
+cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, int64_t lda, const std::complex<float> *x,
+                     int64_t incx, std::complex<float> beta, std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const float beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_chemv_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex8 mkl_beta = { beta_re, beta_im };
+            ::chemv(&uplo_c, (const MKL_INT *)&n, &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM hemv (complex<double>): queues a host_task that runs ::zhemv after `dependencies`.
+cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                     const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const double beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zhemv_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex16 mkl_beta = { beta_re, beta_im };
+            ::zhemv(&uplo_c, (const MKL_INT *)&n,
+                    &mkl_alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM her (complex<float>, real alpha): queues a host_task running ::cher after `dependencies`.
+cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                    const std::complex<float> *x, int64_t incx, std::complex<float> *a, int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::cher stay valid.
+        host_task<class mkl_kernel_cher_usm>(cgh, [=]() {
+            ::cher(&uplo_c, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, a,
+                   (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM her (complex<double>, real alpha): queues a host_task running ::zher after `dependencies`.
+cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                    const std::complex<double> *x, int64_t incx, std::complex<double> *a,
+                    int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::zher stay valid.
+        host_task<class mkl_kernel_zher_usm>(cgh, [=]() {
+            ::zher(&uplo_c, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, a,
+                   (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM her2 (complex<float>): queues a host_task that runs ::cher2 after `dependencies`.
+cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                     int64_t incy, std::complex<float> *a, int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_cher2_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            ::cher2(&uplo_c, (const MKL_INT *)&n, &mkl_alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM her2 (complex<double>): queues a host_task that runs ::zher2 after `dependencies`.
+cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
+                     const std::complex<double> *y, int64_t incy, std::complex<double> *a,
+                     int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zher2_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            ::zher2(&uplo_c, (const MKL_INT *)&n, &mkl_alpha,
+                    x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy,
+                    a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+// USM hpmv (complex<float>): queues a host_task that runs ::chpmv after `dependencies`.
+cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *ap, const std::complex<float> *x, int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const float beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_chpmv_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex8 mkl_beta = { beta_re, beta_im };
+            ::chpmv(&uplo_c, (const MKL_INT *)&n, &mkl_alpha, ap,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM hpmv (complex<double>): queues a host_task that runs ::zhpmv after `dependencies`.
+cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *ap,
+                     const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        const double beta_re = beta.real(), beta_im = beta.imag();
+        // [=] copies the scalars into the task; complex ones are rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zhpmv_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            const MKL_Complex16 mkl_beta = { beta_re, beta_im };
+            ::zhpmv(&uplo_c, (const MKL_INT *)&n, &mkl_alpha, ap,
+                    x, (const MKL_INT *)&incx,
+                    &mkl_beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM hpr (complex<float>, real alpha): queues a host_task running ::chpr after `dependencies`.
+cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                    const std::complex<float> *x, int64_t incx, std::complex<float> *ap,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::chpr stay valid.
+        host_task<class mkl_kernel_chpr_usm>(cgh, [=]() {
+            ::chpr(&uplo_c, (const MKL_INT *)&n, &alpha, x,
+                   (const MKL_INT *)&incx, ap);
+        });
+    });
+}
+
+// USM hpr (complex<double>, real alpha): queues a host_task running ::zhpr after `dependencies`.
+cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                    const std::complex<double> *x, int64_t incx, std::complex<double> *ap,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::zhpr stay valid.
+        host_task<class mkl_kernel_zhpr_usm>(cgh, [=]() {
+            ::zhpr(&uplo_c, (const MKL_INT *)&n, &alpha, x,
+                   (const MKL_INT *)&incx, ap);
+        });
+    });
+}
+
+// USM hpr2 (complex<float>): queues a host_task that runs ::chpr2 after `dependencies`.
+cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                     int64_t incy, std::complex<float> *ap,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex8.
+        host_task<class mkl_kernel_chpr2_usm>(cgh, [=]() {
+            const MKL_Complex8 mkl_alpha = { alpha_re, alpha_im };
+            ::chpr2(&uplo_c, (const MKL_INT *)&n, &mkl_alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy, ap);
+        });
+    });
+}
+
+// USM hpr2 (complex<double>): queues a host_task that runs ::zhpr2 after `dependencies`.
+cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
+                     const std::complex<double> *y, int64_t incy, std::complex<double> *ap,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        const double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        // [=] copies the scalars into the task; alpha is rebuilt there as MKL_Complex16.
+        host_task<class mkl_kernel_zhpr2_usm>(cgh, [=]() {
+            const MKL_Complex16 mkl_alpha = { alpha_re, alpha_im };
+            ::zhpr2(&uplo_c, (const MKL_INT *)&n, &mkl_alpha,
+                    x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy, ap);
+        });
+    });
+}
+
+// USM sbmv (float): queues a host_task that runs ::ssbmv after `dependencies`.
+cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha,
+                     const float *a, int64_t lda, const float *x, int64_t incx, float beta,
+                     float *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::ssbmv stay valid.
+        host_task<class mkl_kernel_ssbmv_usm>(cgh, [=]() {
+            ::ssbmv(&uplo_c, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &beta, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM sbmv (double): queues a host_task that runs ::dsbmv after `dependencies`.
+cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha,
+                     const double *a, int64_t lda, const double *x, int64_t incx, double beta,
+                     double *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::dsbmv stay valid.
+        host_task<class mkl_kernel_dsbmv_usm>(cgh, [=]() {
+            ::dsbmv(&uplo_c, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, &beta, y,
+                    (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM spmv (float): queues a host_task that runs ::sspmv after `dependencies`.
+cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                     const float *ap, const float *x, int64_t incx, float beta, float *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::sspmv stay valid.
+        host_task<class mkl_kernel_sspmv_usm>(cgh, [=]() {
+            ::sspmv(&uplo_c, (const MKL_INT *)&n, &alpha, ap, x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM spmv (double): queues a host_task that runs ::dspmv after `dependencies`.
+cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                     const double *ap, const double *x, int64_t incx, double beta, double *y,
+                     int64_t incy, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::dspmv stay valid.
+        host_task<class mkl_kernel_dspmv_usm>(cgh, [=]() {
+            ::dspmv(&uplo_c, (const MKL_INT *)&n, &alpha, ap, x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM spr (float): queues a host_task that runs ::sspr after `dependencies`.
+cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                    const float *x, int64_t incx, float *ap,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::sspr stay valid.
+        host_task<class mkl_kernel_sspr_usm>(cgh, [=]() {
+            ::sspr(&uplo_c, (const MKL_INT *)&n, &alpha, x,
+                   (const MKL_INT *)&incx, ap);
+        });
+    });
+}
+
+// USM spr (double): queues a host_task that runs ::dspr after `dependencies`.
+cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                    const double *x, int64_t incx, double *ap,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::dspr stay valid.
+        host_task<class mkl_kernel_dspr_usm>(cgh, [=]() {
+            ::dspr(&uplo_c, (const MKL_INT *)&n, &alpha, x,
+                   (const MKL_INT *)&incx, ap);
+        });
+    });
+}
+
+// USM spr2 (float): queues a host_task that runs ::sspr2 after `dependencies`.
+cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                     const float *x, int64_t incx, const float *y, int64_t incy, float *ap,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::sspr2 stay valid.
+        host_task<class mkl_kernel_sspr2_usm>(cgh, [=]() {
+            ::sspr2(&uplo_c, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy, ap);
+        });
+    });
+}
+
+// USM spr2 (double): queues a host_task that runs ::dspr2 after `dependencies`.
+cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                     const double *x, int64_t incx, const double *y, int64_t incy, double *ap,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::dspr2 stay valid.
+        host_task<class mkl_kernel_dspr2_usm>(cgh, [=]() {
+            ::dspr2(&uplo_c, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx,
+                    y, (const MKL_INT *)&incy, ap);
+        });
+    });
+}
+
+// USM symv (float): queues a host_task that runs ::ssymv after `dependencies`.
+cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                     const float *a, int64_t lda, const float *x, int64_t incx, float beta,
+                     float *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::ssymv stay valid.
+        host_task<class mkl_kernel_ssymv_usm>(cgh, [=]() {
+            ::ssymv(&uplo_c, (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+// USM symv (double): queues a host_task that runs ::dsymv after `dependencies`.
+cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                     const double *a, int64_t lda, const double *x, int64_t incx, double beta,
+                     double *y, int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_c = *fortran_char(upper_lower);
+        // [=] copies the scalars into the task, so the addresses passed to ::dsymv stay valid.
+        host_task<class mkl_kernel_dsymv_usm>(cgh, [=]() {
+            ::dsymv(&uplo_c, (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx,
+                    &beta, y, (const MKL_INT *)&incy);
+        });
+    });
+}
+
+cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                    const float *x, int64_t incx, float *a, int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM syr, float: enqueue a command group that runs ::ssyr via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_ssyr_usm>(cgh, [=]() {
+            ::ssyr(&uplo_ch, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, a,
+                   (const MKL_INT *)&lda);
+        });
+    });
+}
+
+cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                    const double *x, int64_t incx, double *a, int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM syr, double: enqueue a command group that runs ::dsyr via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_dsyr_usm>(cgh, [=]() {
+            ::dsyr(&uplo_ch, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, a,
+                   (const MKL_INT *)&lda);
+        });
+    });
+}
+
+cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
+                     const float *x, int64_t incx, const float *y, int64_t incy, float *a,
+                     int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM syr2, float: enqueue a command group that runs ::ssyr2 via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_ssyr2_usm>(cgh, [=]() {
+            ::ssyr2(&uplo_ch, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, y,
+                    (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
+                     const double *x, int64_t incx, const double *y, int64_t incy, double *a,
+                     int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM syr2, double: enqueue a command group that runs ::dsyr2 via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_dsyr2_usm>(cgh, [=]() {
+            ::dsyr2(&uplo_ch, (const MKL_INT *)&n, &alpha, x, (const MKL_INT *)&incx, y,
+                    (const MKL_INT *)&incy, a, (const MKL_INT *)&lda);
+        });
+    });
+}
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbmv, float: enqueue a command group that runs ::stbmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_stbmv_usm>(cgh, [=]() {
+            ::stbmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbmv, double: enqueue a command group that runs ::dtbmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtbmv_usm>(cgh, [=]() {
+            ::dtbmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const std::complex<float> *a, int64_t lda,
+                     std::complex<float> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbmv, std::complex<float>: enqueue a command group that runs ::ctbmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctbmv_usm>(cgh, [=]() {
+            ::ctbmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const std::complex<double> *a, int64_t lda,
+                     std::complex<double> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbmv, std::complex<double>: enqueue a command group that runs ::ztbmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztbmv_usm>(cgh, [=]() {
+            ::ztbmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbsv, float: enqueue a command group that runs ::stbsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_stbsv_usm>(cgh, [=]() {
+            ::stbsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbsv, double: enqueue a command group that runs ::dtbsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtbsv_usm>(cgh, [=]() {
+            ::dtbsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const std::complex<float> *a, int64_t lda,
+                     std::complex<float> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbsv, std::complex<float>: enqueue a command group that runs ::ctbsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctbsv_usm>(cgh, [=]() {
+            ::ctbsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, int64_t k, const std::complex<double> *a, int64_t lda,
+                     std::complex<double> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tbsv, std::complex<double>: enqueue a command group that runs ::ztbsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztbsv_usm>(cgh, [=]() {
+            ::ztbsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    a, (const MKL_INT *)&lda,
+                    x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const float *ap, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpmv, float: enqueue a command group that runs ::stpmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_stpmv_usm>(cgh, [=]() {
+            ::stpmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const double *ap, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpmv, double: enqueue a command group that runs ::dtpmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtpmv_usm>(cgh, [=]() {
+            ::dtpmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<float> *ap, std::complex<float> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpmv, std::complex<float>: enqueue a command group that runs ::ctpmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctpmv_usm>(cgh, [=]() {
+            ::ctpmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<double> *ap, std::complex<double> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpmv, std::complex<double>: enqueue a command group that runs ::ztpmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztpmv_usm>(cgh, [=]() {
+            ::ztpmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const float *ap, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpsv, float: enqueue a command group that runs ::stpsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_stpsv_usm>(cgh, [=]() {
+            ::stpsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const double *ap, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpsv, double: enqueue a command group that runs ::dtpsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtpsv_usm>(cgh, [=]() {
+            ::dtpsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<float> *ap, std::complex<float> *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpsv, std::complex<float>: enqueue a command group that runs ::ctpsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctpsv_usm>(cgh, [=]() {
+            ::ctpsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<double> *ap, std::complex<double> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM tpsv, std::complex<double>: enqueue a command group that runs ::ztpsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztpsv_usm>(cgh, [=]() {
+            ::ztpsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, ap, x,
+                    (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag,
+                     int64_t n, const float *a, int64_t lda, float *b, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trmv, float: enqueue a command group that runs ::strmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_strmv_usm>(cgh, [=]() {
+            ::strmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag,
+                     int64_t n, const double *a, int64_t lda, double *b, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trmv, double: enqueue a command group that runs ::dtrmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtrmv_usm>(cgh, [=]() {
+            ::dtrmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag,
+                     int64_t n, const std::complex<float> *a, int64_t lda, std::complex<float> *b,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trmv, std::complex<float>: enqueue a command group that runs ::ctrmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctrmv_usm>(cgh, [=]() {
+            ::ctrmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag,
+                     int64_t n, const std::complex<double> *a, int64_t lda, std::complex<double> *b,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trmv, std::complex<double>: enqueue a command group that runs ::ztrmv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztrmv_usm>(cgh, [=]() {
+            ::ztrmv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const float *a, int64_t lda, float *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trsv, float: enqueue a command group that runs ::strsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_strsv_usm>(cgh, [=]() {
+            ::strsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const double *a, int64_t lda, double *x, int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trsv, double: enqueue a command group that runs ::dtrsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtrsv_usm>(cgh, [=]() {
+            ::dtrsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<float> *a, int64_t lda, std::complex<float> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trsv, std::complex<float>: enqueue a command group that runs ::ctrsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ctrsv_usm>(cgh, [=]() {
+            ::ctrsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
+cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
+                     int64_t n, const std::complex<double> *a, int64_t lda, std::complex<double> *x,
+                     int64_t incx, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // USM trsv, std::complex<double>: enqueue a command group that runs ::ztrsv via host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Register every caller-supplied event as a dependency of this command group.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_ztrsv_usm>(cgh, [=]() {
+            ::ztrsv(&uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&n, a,
+                    (const MKL_INT *)&lda, x, (const MKL_INT *)&incx);
+        });
+    });
+}
+
 } // namespace mklcpu
 } // namespace onemkl
diff --git a/src/blas/backends/mklcpu/cpu_level3.cpp b/src/blas/backends/mklcpu/cpu_level3.cpp
index d45eec31e..e3cb2b80e 100644
--- a/src/blas/backends/mklcpu/cpu_level3.cpp
+++ b/src/blas/backends/mklcpu/cpu_level3.cpp
@@ -25,6 +25,8 @@
 namespace onemkl {
 namespace mklcpu {
 
+// Buffer APIs
+
 void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
           int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
           cl::sycl::buffer<float, 1> &b, int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c,
@@ -641,5 +643,695 @@ void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose t
     });
 }
 
+// USM APIs
+
+// USM sgemm: submits a host_task that calls ::sgemm; returns the event from queue.submit.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                     int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
+                     int64_t ldb, float beta, float *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char ta = *fortran_char(transa);
+        const char tb = *fortran_char(transb);
+        host_task<class mkl_kernel_sgemm_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::sgemm(&ta, &tb, (const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM dgemm: submits a host_task that calls ::dgemm; returns the event from queue.submit.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                     int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                     const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char ta = *fortran_char(transa);
+        const char tb = *fortran_char(transb);
+        host_task<class mkl_kernel_dgemm_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::dgemm(&ta, &tb, (const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM cgemm: submits a host_task that calls ::cgemm; returns the event from queue.submit.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                     int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                     int64_t lda, const std::complex<float> *b, int64_t ldb,
+                     std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char ta = *fortran_char(transa);
+        const char tb = *fortran_char(transb);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_cgemm_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex8 beta_c  = { beta_re, beta_im };
+            ::cgemm(&ta, &tb, (const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &alpha_c, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zgemm: submits a host_task that calls ::zgemm; returns the event from queue.submit.
+cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m,
+                     int64_t n, int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
+                     int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char ta = *fortran_char(transa);
+        const char tb = *fortran_char(transb);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zgemm_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex16 beta_c  = { beta_re, beta_im };
+            ::zgemm(&ta, &tb, (const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_INT *)&k,
+                    &alpha_c, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM chemm: submits a host_task that calls ::chemm; returns the event from queue.submit.
+cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     int64_t lda, const std::complex<float> *b, int64_t ldb,
+                     std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_chemm_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex8 beta_c  = { beta_re, beta_im };
+            ::chemm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha_c, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zhemm: submits a host_task that calls ::zhemm; returns the event from queue.submit.
+cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     int64_t lda, const std::complex<double> *b, int64_t ldb,
+                     std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zhemm_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex16 beta_c  = { beta_re, beta_im };
+            ::zhemm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha_c, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM cherk: submits a host_task that calls ::cherk; returns the event from queue.submit.
+cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, float alpha, const std::complex<float> *a, int64_t lda, float beta,
+                     std::complex<float> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_cherk_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::cherk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, &beta, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zherk: submits a host_task that calls ::zherk; returns the event from queue.submit.
+cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, double alpha, const std::complex<double> *a, int64_t lda,
+                     double beta, std::complex<double> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_zherk_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::zherk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, &beta, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM cher2k: submits a host_task that calls ::cher2k; returns the event from queue.submit.
+cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      int64_t lda, const std::complex<float> *b, int64_t ldb, float beta,
+                      std::complex<float> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_cher2k_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            ::cher2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zher2k: submits a host_task that calls ::zher2k; returns the event from queue.submit.
+cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                      int64_t lda, const std::complex<double> *b, int64_t ldb, double beta,
+                      std::complex<double> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_zher2k_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            ::zher2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM ssymm: submits a host_task that calls ::ssymm; returns the event from queue.submit.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, float alpha, const float *a, int64_t lda, const float *b,
+                     int64_t ldb, float beta, float *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_ssymm_usm>(cgh, [=]() {
+            ::ssymm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM dsymm: submits a host_task that calls ::dsymm; returns the event from queue.submit.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, double alpha, const double *a, int64_t lda, const double *b,
+                     int64_t ldb, double beta, double *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        host_task<class mkl_kernel_dsymm_usm>(cgh, [=]() {
+            ::dsymm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM csymm: submits a host_task that calls ::csymm; returns the event from queue.submit.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     int64_t lda, const std::complex<float> *b, int64_t ldb,
+                     std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_csymm_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex8 beta_c  = { beta_re, beta_im };
+            ::csymm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha_c, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zsymm: submits a host_task that calls ::zsymm; returns the event from queue.submit.
+cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
+                     int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     int64_t lda, const std::complex<double> *b, int64_t ldb,
+                     std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch = *fortran_char(left_right);
+        const char uplo_ch = *fortran_char(upper_lower);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zsymm_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex16 beta_c  = { beta_re, beta_im };
+            ::zsymm(&side_ch, &uplo_ch, (const MKL_INT *)&m, (const MKL_INT *)&n, &alpha_c, a,
+                    (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                    (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM ssyrk: submits a host_task that calls ::ssyrk; returns the event from queue.submit.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, float alpha, const float *a, int64_t lda, float beta, float *c,
+                     int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_ssyrk_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::ssyrk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, &beta, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM dsyrk: submits a host_task that calls ::dsyrk; returns the event from queue.submit.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, double alpha, const double *a, int64_t lda, double beta, double *c,
+                     int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_dsyrk_usm>(cgh, [=]() {
+            // The BLAS entry point receives every scalar argument by address.
+            ::dsyrk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                    (const MKL_INT *)&lda, &beta, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM csyrk: submits a host_task that calls ::csyrk; returns the event from queue.submit.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                     int64_t lda, std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_csyrk_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex8 beta_c  = { beta_re, beta_im };
+            ::csyrk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                    (const MKL_INT *)&lda, &beta_c, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zsyrk: submits a host_task that calls ::zsyrk; returns the event from queue.submit.
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                     int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                     int64_t lda, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zsyrk_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex16 beta_c  = { beta_re, beta_im };
+            ::zsyrk(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                    (const MKL_INT *)&lda, &beta_c, c, (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM ssyr2k: submits a host_task that calls ::ssyr2k; returns the event from queue.submit.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, float alpha, const float *a, int64_t lda, const float *b,
+                      int64_t ldb, float beta, float *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_ssyr2k_usm>(cgh, [=]() {
+            ::ssyr2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM dsyr2k: submits a host_task that calls ::dsyr2k; returns the event from queue.submit.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, double alpha, const double *a, int64_t lda, const double *b,
+                      int64_t ldb, double beta, double *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        host_task<class mkl_kernel_dsyr2k_usm>(cgh, [=]() {
+            ::dsyr2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM csyr2k: submits a host_task that calls ::csyr2k; returns the event from queue.submit.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                      int64_t lda, const std::complex<float> *b, int64_t ldb,
+                      std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        float beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_csyr2k_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex8 beta_c  = { beta_re, beta_im };
+            ::csyr2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM zsyr2k: submits a host_task that calls ::zsyr2k; returns the event from queue.submit.
+cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
+                      int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                      int64_t lda, const std::complex<double> *b, int64_t ldb,
+                      std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(trans);
+        // Split the complex scalars into real/imaginary parts here; the task rebuilds them.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        double beta_re = beta.real(), beta_im = beta.imag();
+        host_task<class mkl_kernel_zsyr2k_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            const MKL_Complex16 beta_c  = { beta_re, beta_im };
+            ::zsyr2k(&uplo_ch, &trans_ch, (const MKL_INT *)&n, (const MKL_INT *)&k, &alpha_c, a,
+                     (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, &beta_c, c,
+                     (const MKL_INT *)&ldc);
+        });
+    });
+}
+
+// USM strmm: submits a host_task that calls ::strmm; returns the event from queue.submit.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
+                     float *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch  = *fortran_char(left_right);
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_strmm_usm>(cgh, [=]() {
+            ::strmm(&side_ch, &uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+// USM dtrmm: submits a host_task that calls ::dtrmm; returns the event from queue.submit.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
+                     int64_t lda, double *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch  = *fortran_char(left_right);
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtrmm_usm>(cgh, [=]() {
+            ::dtrmm(&side_ch, &uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+// USM ctrmm: submits a host_task that calls ::ctrmm; returns the event from queue.submit.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch  = *fortran_char(left_right);
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        // Split the complex scalar into real/imaginary parts here; the task rebuilds it.
+        float alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_ctrmm_usm>(cgh, [=]() {
+            const MKL_Complex8 alpha_c = { alpha_re, alpha_im };
+            ::ctrmm(&side_ch, &uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha_c, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+// USM ztrmm: submits a host_task that calls ::ztrmm; returns the event from queue.submit.
+cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, int64_t lda, std::complex<double> *b,
+                     int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        // Order this submission after every event the caller passed in.
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        const char side_ch  = *fortran_char(left_right);
+        const char uplo_ch  = *fortran_char(upper_lower);
+        const char trans_ch = *fortran_char(transa);
+        const char diag_ch  = *fortran_char(unit_diag);
+        // Split the complex scalar into real/imaginary parts here; the task rebuilds it.
+        double alpha_re = alpha.real(), alpha_im = alpha.imag();
+        host_task<class mkl_kernel_ztrmm_usm>(cgh, [=]() {
+            const MKL_Complex16 alpha_c = { alpha_re, alpha_im };
+            ::ztrmm(&side_ch, &uplo_ch, &trans_ch, &diag_ch, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha_c, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
+                     float *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // float USM trsm: registers every dependency, then hands ::strsm to host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // Enum options become single chars whose addresses are passed to strsm.
+        const char side_c  = *fortran_char(left_right);
+        const char uplo_c  = *fortran_char(upper_lower);
+        const char trans_c = *fortran_char(transa);
+        const char diag_c  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_strsm_usm>(cgh, [=]() {
+            ::strsm(&side_c, &uplo_c, &trans_c, &diag_c, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
+                     int64_t lda, double *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // double USM trsm: registers every dependency, then hands ::dtrsm to host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // Enum options become single chars whose addresses are passed to dtrsm.
+        const char side_c  = *fortran_char(left_right);
+        const char uplo_c  = *fortran_char(upper_lower);
+        const char trans_c = *fortran_char(transa);
+        const char diag_c  = *fortran_char(unit_diag);
+        host_task<class mkl_kernel_dtrsm_usm>(cgh, [=]() {
+            ::dtrsm(&side_c, &uplo_c, &trans_c, &diag_c, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // complex<float> USM trsm: registers every dependency, then hands ::ctrsm to host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // Enum options become single chars whose addresses are passed to ctrsm.
+        const char side_c  = *fortran_char(left_right);
+        const char uplo_c  = *fortran_char(upper_lower);
+        const char trans_c = *fortran_char(transa);
+        const char diag_c  = *fortran_char(unit_diag);
+        const float re = alpha.real(), im = alpha.imag();
+        host_task<class mkl_kernel_ctrsm_usm>(cgh, [=]() {
+            // alpha is rebuilt as MKL_Complex8 here so its address can be passed on.
+            MKL_Complex8 alpha_ = { re, im };
+            ::ctrsm(&side_c, &uplo_c, &trans_c, &diag_c, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha_, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
+                     diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, int64_t lda, std::complex<double> *b,
+                     int64_t ldb, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // complex<double> USM trsm: registers every dependency, then hands ::ztrsm to host_task.
+    return queue.submit([&](cl::sycl::handler &cgh) {
+        for (const cl::sycl::event &dep : dependencies) {
+            cgh.depends_on(dep);
+        }
+        // Enum options become single chars whose addresses are passed to ztrsm.
+        const char side_c  = *fortran_char(left_right);
+        const char uplo_c  = *fortran_char(upper_lower);
+        const char trans_c = *fortran_char(transa);
+        const char diag_c  = *fortran_char(unit_diag);
+        const double re = alpha.real(), im = alpha.imag();
+        host_task<class mkl_kernel_ztrsm_usm>(cgh, [=]() {
+            // alpha is rebuilt as MKL_Complex16 here so its address can be passed on.
+            MKL_Complex16 alpha_ = { re, im };
+            ::ztrsm(&side_c, &uplo_c, &trans_c, &diag_c, (const MKL_INT *)&m,
+                    (const MKL_INT *)&n, &alpha_, a, (const MKL_INT *)&lda, b,
+                    (const MKL_INT *)&ldb);
+        });
+    });
+}
+
 } // namespace mklcpu
 } // namespace onemkl
diff --git a/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp b/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp
index 0cfa0483a..00d89481f 100644
--- a/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp
+++ b/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp
@@ -179,14 +179,6 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = {
     onemkl::mklcpu::gemm_batch,
     onemkl::mklcpu::gemm_batch,
     onemkl::mklcpu::gemm_batch,
-    onemkl::mklcpu::gemm_batch,
-    onemkl::mklcpu::gemm_batch,
-    onemkl::mklcpu::gemm_batch,
-    onemkl::mklcpu::gemm_batch,
-    onemkl::mklcpu::trsm_batch,
-    onemkl::mklcpu::trsm_batch,
-    onemkl::mklcpu::trsm_batch,
-    onemkl::mklcpu::trsm_batch,
     onemkl::mklcpu::trsm_batch,
     onemkl::mklcpu::trsm_batch,
     onemkl::mklcpu::trsm_batch,
@@ -202,4 +194,170 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = {
     onemkl::mklcpu::gemm_ext,
     onemkl::mklcpu::gemm_ext,
     onemkl::mklcpu::gemm_ext,
+    onemkl::mklcpu::asum,
+    onemkl::mklcpu::asum,
+    onemkl::mklcpu::asum,
+    onemkl::mklcpu::asum,
+    onemkl::mklcpu::axpy,
+    onemkl::mklcpu::axpy,
+    onemkl::mklcpu::axpy,
+    onemkl::mklcpu::axpy,
+    onemkl::mklcpu::axpy_batch,
+    onemkl::mklcpu::axpy_batch,
+    onemkl::mklcpu::axpy_batch,
+    onemkl::mklcpu::axpy_batch,
+    onemkl::mklcpu::copy,
+    onemkl::mklcpu::copy,
+    onemkl::mklcpu::copy,
+    onemkl::mklcpu::copy,
+    onemkl::mklcpu::dot,
+    onemkl::mklcpu::dot,
+    onemkl::mklcpu::dot,
+    onemkl::mklcpu::dotc,
+    onemkl::mklcpu::dotc,
+    onemkl::mklcpu::dotu,
+    onemkl::mklcpu::dotu,
+    onemkl::mklcpu::iamin,
+    onemkl::mklcpu::iamin,
+    onemkl::mklcpu::iamin,
+    onemkl::mklcpu::iamin,
+    onemkl::mklcpu::iamax,
+    onemkl::mklcpu::iamax,
+    onemkl::mklcpu::iamax,
+    onemkl::mklcpu::iamax,
+    onemkl::mklcpu::nrm2,
+    onemkl::mklcpu::nrm2,
+    onemkl::mklcpu::nrm2,
+    onemkl::mklcpu::nrm2,
+    onemkl::mklcpu::rot,
+    onemkl::mklcpu::rot,
+    onemkl::mklcpu::rot,
+    onemkl::mklcpu::rot,
+    onemkl::mklcpu::rotg,
+    onemkl::mklcpu::rotg,
+    onemkl::mklcpu::rotg,
+    onemkl::mklcpu::rotg,
+    onemkl::mklcpu::rotm,
+    onemkl::mklcpu::rotm,
+    onemkl::mklcpu::rotmg,
+    onemkl::mklcpu::rotmg,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::scal,
+    onemkl::mklcpu::sdsdot,
+    onemkl::mklcpu::swap,
+    onemkl::mklcpu::swap,
+    onemkl::mklcpu::swap,
+    onemkl::mklcpu::swap,
+    onemkl::mklcpu::gbmv,
+    onemkl::mklcpu::gbmv,
+    onemkl::mklcpu::gbmv,
+    onemkl::mklcpu::gbmv,
+    onemkl::mklcpu::gemv,
+    onemkl::mklcpu::gemv,
+    onemkl::mklcpu::gemv,
+    onemkl::mklcpu::gemv,
+    onemkl::mklcpu::ger,
+    onemkl::mklcpu::ger,
+    onemkl::mklcpu::gerc,
+    onemkl::mklcpu::gerc,
+    onemkl::mklcpu::geru,
+    onemkl::mklcpu::geru,
+    onemkl::mklcpu::hbmv,
+    onemkl::mklcpu::hbmv,
+    onemkl::mklcpu::hemv,
+    onemkl::mklcpu::hemv,
+    onemkl::mklcpu::her,
+    onemkl::mklcpu::her,
+    onemkl::mklcpu::her2,
+    onemkl::mklcpu::her2,
+    onemkl::mklcpu::hpmv,
+    onemkl::mklcpu::hpmv,
+    onemkl::mklcpu::hpr,
+    onemkl::mklcpu::hpr,
+    onemkl::mklcpu::hpr2,
+    onemkl::mklcpu::hpr2,
+    onemkl::mklcpu::sbmv,
+    onemkl::mklcpu::sbmv,
+    onemkl::mklcpu::spmv,
+    onemkl::mklcpu::spmv,
+    onemkl::mklcpu::spr,
+    onemkl::mklcpu::spr,
+    onemkl::mklcpu::spr2,
+    onemkl::mklcpu::spr2,
+    onemkl::mklcpu::symv,
+    onemkl::mklcpu::symv,
+    onemkl::mklcpu::syr,
+    onemkl::mklcpu::syr,
+    onemkl::mklcpu::syr2,
+    onemkl::mklcpu::syr2,
+    onemkl::mklcpu::tbmv,
+    onemkl::mklcpu::tbmv,
+    onemkl::mklcpu::tbmv,
+    onemkl::mklcpu::tbmv,
+    onemkl::mklcpu::tbsv,
+    onemkl::mklcpu::tbsv,
+    onemkl::mklcpu::tbsv,
+    onemkl::mklcpu::tbsv,
+    onemkl::mklcpu::tpmv,
+    onemkl::mklcpu::tpmv,
+    onemkl::mklcpu::tpmv,
+    onemkl::mklcpu::tpmv,
+    onemkl::mklcpu::tpsv,
+    onemkl::mklcpu::tpsv,
+    onemkl::mklcpu::tpsv,
+    onemkl::mklcpu::tpsv,
+    onemkl::mklcpu::trmv,
+    onemkl::mklcpu::trmv,
+    onemkl::mklcpu::trmv,
+    onemkl::mklcpu::trmv,
+    onemkl::mklcpu::trsv,
+    onemkl::mklcpu::trsv,
+    onemkl::mklcpu::trsv,
+    onemkl::mklcpu::trsv,
+    onemkl::mklcpu::gemm,
+    onemkl::mklcpu::gemm,
+    onemkl::mklcpu::gemm,
+    onemkl::mklcpu::gemm,
+    onemkl::mklcpu::hemm,
+    onemkl::mklcpu::hemm,
+    onemkl::mklcpu::herk,
+    onemkl::mklcpu::herk,
+    onemkl::mklcpu::her2k,
+    onemkl::mklcpu::her2k,
+    onemkl::mklcpu::symm,
+    onemkl::mklcpu::symm,
+    onemkl::mklcpu::symm,
+    onemkl::mklcpu::symm,
+    onemkl::mklcpu::syrk,
+    onemkl::mklcpu::syrk,
+    onemkl::mklcpu::syrk,
+    onemkl::mklcpu::syrk,
+    onemkl::mklcpu::syr2k,
+    onemkl::mklcpu::syr2k,
+    onemkl::mklcpu::syr2k,
+    onemkl::mklcpu::syr2k,
+    onemkl::mklcpu::trmm,
+    onemkl::mklcpu::trmm,
+    onemkl::mklcpu::trmm,
+    onemkl::mklcpu::trmm,
+    onemkl::mklcpu::trsm,
+    onemkl::mklcpu::trsm,
+    onemkl::mklcpu::trsm,
+    onemkl::mklcpu::trsm,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemm_batch,
+    onemkl::mklcpu::gemmt,
+    onemkl::mklcpu::gemmt,
+    onemkl::mklcpu::gemmt,
+    onemkl::mklcpu::gemmt,
 };
diff --git a/src/blas/backends/mklgpu/CMakeLists.txt b/src/blas/backends/mklgpu/CMakeLists.txt
index 44f42facd..f2f35c831 100644
--- a/src/blas/backends/mklgpu/CMakeLists.txt
+++ b/src/blas/backends/mklgpu/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(${LIB_NAME})
 add_library(${LIB_OBJ} OBJECT
   mkl_internal_blas_gpu_wrappers.cpp
   mkl_blas_sycl_buffer.cpp
+  mkl_blas_sycl_usm.cpp
   $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_blas_gpu_wrappers.cpp>
 )
 
@@ -44,7 +45,6 @@ set_target_properties(${LIB_OBJ} PROPERTIES
 )
 target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
 
-#Set MKL libraries as not transitive for dynamic
 if(BUILD_SHARED_LIBS)
   set_target_properties(${LIB_NAME} PROPERTIES
     INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
diff --git a/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp
index a54717dd0..5d4f81e70 100644
--- a/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp
+++ b/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp
@@ -179,14 +179,6 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = {
     onemkl::mklgpu::gemm_batch,
     onemkl::mklgpu::gemm_batch,
     onemkl::mklgpu::gemm_batch,
-    onemkl::mklgpu::gemm_batch,
-    onemkl::mklgpu::gemm_batch,
-    onemkl::mklgpu::gemm_batch,
-    onemkl::mklgpu::gemm_batch,
-    onemkl::mklgpu::trsm_batch,
-    onemkl::mklgpu::trsm_batch,
-    onemkl::mklgpu::trsm_batch,
-    onemkl::mklgpu::trsm_batch,
     onemkl::mklgpu::trsm_batch,
     onemkl::mklgpu::trsm_batch,
     onemkl::mklgpu::trsm_batch,
@@ -202,4 +194,170 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = {
     onemkl::mklgpu::gemm_ext,
     onemkl::mklgpu::gemm_ext,
     onemkl::mklgpu::gemm_ext,
+    onemkl::mklgpu::asum,
+    onemkl::mklgpu::asum,
+    onemkl::mklgpu::asum,
+    onemkl::mklgpu::asum,
+    onemkl::mklgpu::axpy,
+    onemkl::mklgpu::axpy,
+    onemkl::mklgpu::axpy,
+    onemkl::mklgpu::axpy,
+    onemkl::mklgpu::axpy_batch,
+    onemkl::mklgpu::axpy_batch,
+    onemkl::mklgpu::axpy_batch,
+    onemkl::mklgpu::axpy_batch,
+    onemkl::mklgpu::copy,
+    onemkl::mklgpu::copy,
+    onemkl::mklgpu::copy,
+    onemkl::mklgpu::copy,
+    onemkl::mklgpu::dot,
+    onemkl::mklgpu::dot,
+    onemkl::mklgpu::dot,
+    onemkl::mklgpu::dotc,
+    onemkl::mklgpu::dotc,
+    onemkl::mklgpu::dotu,
+    onemkl::mklgpu::dotu,
+    onemkl::mklgpu::iamin,
+    onemkl::mklgpu::iamin,
+    onemkl::mklgpu::iamin,
+    onemkl::mklgpu::iamin,
+    onemkl::mklgpu::iamax,
+    onemkl::mklgpu::iamax,
+    onemkl::mklgpu::iamax,
+    onemkl::mklgpu::iamax,
+    onemkl::mklgpu::nrm2,
+    onemkl::mklgpu::nrm2,
+    onemkl::mklgpu::nrm2,
+    onemkl::mklgpu::nrm2,
+    onemkl::mklgpu::rot,
+    onemkl::mklgpu::rot,
+    onemkl::mklgpu::rot,
+    onemkl::mklgpu::rot,
+    onemkl::mklgpu::rotg,
+    onemkl::mklgpu::rotg,
+    onemkl::mklgpu::rotg,
+    onemkl::mklgpu::rotg,
+    onemkl::mklgpu::rotm,
+    onemkl::mklgpu::rotm,
+    onemkl::mklgpu::rotmg,
+    onemkl::mklgpu::rotmg,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::scal,
+    onemkl::mklgpu::sdsdot,
+    onemkl::mklgpu::swap,
+    onemkl::mklgpu::swap,
+    onemkl::mklgpu::swap,
+    onemkl::mklgpu::swap,
+    onemkl::mklgpu::gbmv,
+    onemkl::mklgpu::gbmv,
+    onemkl::mklgpu::gbmv,
+    onemkl::mklgpu::gbmv,
+    onemkl::mklgpu::gemv,
+    onemkl::mklgpu::gemv,
+    onemkl::mklgpu::gemv,
+    onemkl::mklgpu::gemv,
+    onemkl::mklgpu::ger,
+    onemkl::mklgpu::ger,
+    onemkl::mklgpu::gerc,
+    onemkl::mklgpu::gerc,
+    onemkl::mklgpu::geru,
+    onemkl::mklgpu::geru,
+    onemkl::mklgpu::hbmv,
+    onemkl::mklgpu::hbmv,
+    onemkl::mklgpu::hemv,
+    onemkl::mklgpu::hemv,
+    onemkl::mklgpu::her,
+    onemkl::mklgpu::her,
+    onemkl::mklgpu::her2,
+    onemkl::mklgpu::her2,
+    onemkl::mklgpu::hpmv,
+    onemkl::mklgpu::hpmv,
+    onemkl::mklgpu::hpr,
+    onemkl::mklgpu::hpr,
+    onemkl::mklgpu::hpr2,
+    onemkl::mklgpu::hpr2,
+    onemkl::mklgpu::sbmv,
+    onemkl::mklgpu::sbmv,
+    onemkl::mklgpu::spmv,
+    onemkl::mklgpu::spmv,
+    onemkl::mklgpu::spr,
+    onemkl::mklgpu::spr,
+    onemkl::mklgpu::spr2,
+    onemkl::mklgpu::spr2,
+    onemkl::mklgpu::symv,
+    onemkl::mklgpu::symv,
+    onemkl::mklgpu::syr,
+    onemkl::mklgpu::syr,
+    onemkl::mklgpu::syr2,
+    onemkl::mklgpu::syr2,
+    onemkl::mklgpu::tbmv,
+    onemkl::mklgpu::tbmv,
+    onemkl::mklgpu::tbmv,
+    onemkl::mklgpu::tbmv,
+    onemkl::mklgpu::tbsv,
+    onemkl::mklgpu::tbsv,
+    onemkl::mklgpu::tbsv,
+    onemkl::mklgpu::tbsv,
+    onemkl::mklgpu::tpmv,
+    onemkl::mklgpu::tpmv,
+    onemkl::mklgpu::tpmv,
+    onemkl::mklgpu::tpmv,
+    onemkl::mklgpu::tpsv,
+    onemkl::mklgpu::tpsv,
+    onemkl::mklgpu::tpsv,
+    onemkl::mklgpu::tpsv,
+    onemkl::mklgpu::trmv,
+    onemkl::mklgpu::trmv,
+    onemkl::mklgpu::trmv,
+    onemkl::mklgpu::trmv,
+    onemkl::mklgpu::trsv,
+    onemkl::mklgpu::trsv,
+    onemkl::mklgpu::trsv,
+    onemkl::mklgpu::trsv,
+    onemkl::mklgpu::gemm,
+    onemkl::mklgpu::gemm,
+    onemkl::mklgpu::gemm,
+    onemkl::mklgpu::gemm,
+    onemkl::mklgpu::hemm,
+    onemkl::mklgpu::hemm,
+    onemkl::mklgpu::herk,
+    onemkl::mklgpu::herk,
+    onemkl::mklgpu::her2k,
+    onemkl::mklgpu::her2k,
+    onemkl::mklgpu::symm,
+    onemkl::mklgpu::symm,
+    onemkl::mklgpu::symm,
+    onemkl::mklgpu::symm,
+    onemkl::mklgpu::syrk,
+    onemkl::mklgpu::syrk,
+    onemkl::mklgpu::syrk,
+    onemkl::mklgpu::syrk,
+    onemkl::mklgpu::syr2k,
+    onemkl::mklgpu::syr2k,
+    onemkl::mklgpu::syr2k,
+    onemkl::mklgpu::syr2k,
+    onemkl::mklgpu::trmm,
+    onemkl::mklgpu::trmm,
+    onemkl::mklgpu::trmm,
+    onemkl::mklgpu::trmm,
+    onemkl::mklgpu::trsm,
+    onemkl::mklgpu::trsm,
+    onemkl::mklgpu::trsm,
+    onemkl::mklgpu::trsm,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemm_batch,
+    onemkl::mklgpu::gemmt,
+    onemkl::mklgpu::gemmt,
+    onemkl::mklgpu::gemmt,
+    onemkl::mklgpu::gemmt,
 };
diff --git a/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp b/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp
index 877254e5d..489bf8a2f 100644
--- a/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp
+++ b/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp
@@ -989,59 +989,6 @@ void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<
     onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy);
 }
 
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                                         beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<double, 1> &beta, cl::sycl::buffer<double, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                                         beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                                         beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,
-                                         beta, c, ldc, group_count, group_size);
-}
-
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
@@ -1084,55 +1031,6 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp
                                          ldb, stride_b, beta, c, ldc, stride_c, batch_size);
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n,
-                                         alpha, a, lda, b, ldb, group_count, group_size);
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n,
-                                         alpha, a, lda, b, ldb, group_count, group_size);
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n,
-                                         alpha, a, lda, b, ldb, group_count, group_size);
-}
-
-void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n,
-                                         alpha, a, lda, b, ldb, group_count, group_size);
-}
-
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
diff --git a/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp b/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp
new file mode 100644
index 000000000..325aec6ee
--- /dev/null
+++ b/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp
@@ -0,0 +1,1332 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <CL/sycl.hpp>
+
+#include "mkl_internal_blas_gpu_wrappers.hpp"
+#include "onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
+#include "onemkl/types.hpp"
+
+namespace onemkl {
+namespace mklgpu {
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
+                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                          dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                          dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
+                          ldc, dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                     float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
+                     std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
+                     std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
+                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
+                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, double beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                           dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
+                          b, ldb, dependencies);  // complex<double> overload: pure pass-through
+}
+
+// gemv (float, USM): forwards every argument as-is to internal::gemv.
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+// gemv (double, USM): forwards every argument as-is to internal::gemv.
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+// gemv (complex<float>, USM): forwards every argument as-is to internal::gemv.
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+// gemv (complex<double>, USM): forwards every argument as-is to internal::gemv.
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
+                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // float overload: pure pass-through
+}
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // double overload: pure pass-through
+}
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+// ger (float, USM): forwards every argument as-is to internal::ger.
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// ger (double, USM): forwards every argument as-is to internal::ger.
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                    double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// gerc (complex<float>, USM): forwards every argument as-is to internal::gerc.
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// gerc (complex<double>, USM): forwards every argument as-is to internal::gerc.
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// geru (complex<float>, USM): forwards every argument as-is to internal::geru.
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// geru (complex<double>, USM): forwards every argument as-is to internal::geru.
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<float> overload: pure pass-through
+}
+
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy,
+                          dependencies);  // complex<double> overload: pure pass-through
+}
+
+// her (complex<float>, USM): forwards every argument as-is to internal::her.
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+// her (complex<double>, USM): forwards every argument as-is to internal::her.
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+// her2 (complex<float>, USM): forwards every argument as-is to internal::her2.
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// her2 (complex<double>, USM): forwards every argument as-is to internal::her2.
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+// hpmv (complex<float>, USM): forwards every argument as-is to internal::hpmv.
+cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return internal::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+}
+
+auto hpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+          std::complex<double> alpha, const std::complex<double> *a,
+          const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+          std::complex<double> *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
+                                          dependencies);
+}
+
+auto hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+         const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+auto hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+         const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+auto hpr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+          std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+          const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                          dependencies);
+}
+
+auto hpr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+          std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+          const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                          dependencies);
+}
+
+auto sbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+          std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
+          std::int64_t incx, float beta, float *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                          incy, dependencies);
+}
+
+auto sbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n,
+          std::int64_t k, double alpha, const double *a, std::int64_t lda,
+          const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
+                                          incy, dependencies);
+}
+
+auto symv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+          const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+          float beta, float *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
+                                          incy, dependencies);
+}
+
+auto symv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+          const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+          double beta, double *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
+                                          incy, dependencies);
+}
+
+auto syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+         const float *x, std::int64_t incx, float *a, std::int64_t lda,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::syr(queue, upper_lower, n, alpha, x, incx, a, lda,
+                                         dependencies);
+}
+
+auto syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+         const double *x, std::int64_t incx, double *a, std::int64_t lda,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::syr(queue, upper_lower, n, alpha, x, incx, a, lda,
+                                         dependencies);
+}
+
+auto syr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+          const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+          std::int64_t lda,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                          dependencies);
+}
+
+auto syr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+          const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+          double *a, std::int64_t lda,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
+                                          dependencies);
+}
+
+auto spmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+          const float *a, const float *x, std::int64_t incx, float beta, float *y,
+          std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
+                                          dependencies);
+}
+
+auto spmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+          const double *a, const double *x, std::int64_t incx, double beta, double *y,
+          std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
+                                          dependencies);
+}
+
+auto spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+         const float *x, std::int64_t incx, float *a,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+auto spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+         const double *x, std::int64_t incx, double *a,
+         const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+auto spr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha,
+          const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                          dependencies);
+}
+
+auto spr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha,
+          const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a,
+                                          dependencies);
+}
+
+auto tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+          std::int64_t lda, float *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+          std::int64_t lda, double *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k,
+          const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k,
+          const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+          std::int64_t lda, float *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+          std::int64_t lda, double *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k,
+          const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, std::int64_t k,
+          const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x,
+                                          incx, dependencies);
+}
+
+auto tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const float *a, float *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const double *a, double *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a,
+          std::complex<float> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a,
+          std::complex<double> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const float *a, float *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const double *a, double *x,
+          std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a,
+          std::complex<float> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a,
+          std::complex<double> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx,
+                                          dependencies);
+}
+
+auto trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+          float *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+          double *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a,
+          std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a,
+          std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
+          float *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda,
+          double *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<float> *a,
+          std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+          onemkl::diag unit_diag, std::int64_t n, const std::complex<double> *a,
+          std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
+                                          dependencies);
+}
+
+auto dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+          std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+          std::complex<float> *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::dotc(queue, n, x, incx, y, incy, result, dependencies);
+}
+
+auto dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+          std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+          std::complex<double> *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::dotc(queue, n, x, incx, y, incy, result, dependencies);
+}
+
+auto dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+          std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+          std::complex<float> *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::dotu(queue, n, x, incx, y, incy, result, dependencies);
+}
+
+auto dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+          std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+          std::complex<double> *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::dotu(queue, n, x, incx, y, incy, result, dependencies);
+}
+
+auto iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+           std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies);
+}
+
+auto iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+           std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies);
+}
+
+auto iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+           std::int64_t incx, std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies);
+}
+
+auto iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+           std::int64_t incx, std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies);
+}
+
+auto iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+           std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies);
+}
+
+auto iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+           std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies);
+}
+
+auto iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+           std::int64_t incx, std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies);
+}
+
+auto iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+           std::int64_t incx, std::int64_t *result,
+           const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies);
+}
+
+auto asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+          std::int64_t incx, float *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies);
+}
+
+auto asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+          std::int64_t incx, double *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies);
+}
+
+auto asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies);
+}
+
+auto asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies);
+}
+
+auto axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+          std::int64_t incx, float *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+}
+
+auto axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+          std::int64_t incx, double *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+}
+
+auto axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+          const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+          std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+}
+
+auto axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+          const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
+          std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
+}
+
+auto axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+                std::int64_t *incx, float **y, std::int64_t *incy,
+                std::int64_t group_count, std::int64_t *group_size,
+                const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                                group_size, dependencies);
+}
+
+auto axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+                std::int64_t *incx, double **y, std::int64_t *incy,
+                std::int64_t group_count, std::int64_t *group_size,
+                const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                                group_size, dependencies);
+}
+
+auto axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+                const std::complex<float> **x, std::int64_t *incx,
+                std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
+                std::int64_t *group_size,
+                const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                                group_size, dependencies);
+}
+
+auto axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+                const std::complex<double> **x, std::int64_t *incx,
+                std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
+                std::int64_t *group_size,
+                const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count,
+                                                group_size, dependencies);
+}
+
+auto copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+          float *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies);
+}
+
+auto copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+          double *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies);
+}
+
+auto copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+          std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies);
+}
+
+auto copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+          std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+          const cl::sycl::vector_class<cl::sycl::event> &dependencies) -> cl::sycl::event {
+    return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies);
+}
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies);
+} // dot (float inputs, float *result): pass-through to internal::dot, returns its event
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                    const double *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies);
+} // dot (double inputs, double *result): pass-through to internal::dot, returns its event
+
+cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                       std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies);
+} // sdsdot (float only; sb is taken by value): pass-through to internal::sdsdot, returns its event
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies);
+} // dot (mixed overload: float inputs, double *result): pass-through to internal::dot, returns its event
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies);
+} // nrm2 (std::complex<float> input, real float *result): pass-through to internal::nrm2
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies);
+} // nrm2 (std::complex<double> input, real double *result): pass-through to internal::nrm2
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies);
+} // nrm2 (float input, float *result): pass-through to internal::nrm2, returns its event
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies);
+} // nrm2 (double input, double *result): pass-through to internal::nrm2, returns its event
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+} // rot (std::complex<float> vectors, real float c and s): pass-through to internal::rot
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
+                    double s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+} // rot (std::complex<double> vectors, real double c and s): pass-through to internal::rot
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                    std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+} // rot (float vectors, float c and s): pass-through to internal::rot, returns its event
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+                    std::int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies);
+} // rot (double vectors, double c and s): pass-through to internal::rot, returns its event
+
+cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies);
+} // rotg (float; a, b, c, s all passed as non-const pointers): pass-through to internal::rotg
+
+cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies);
+} // rotg (double; a, b, c, s all passed as non-const pointers): pass-through to internal::rotg
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
+                     float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies);
+} // rotg (std::complex<float> a, b, s; real float *c): pass-through to internal::rotg
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+                     double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies);
+} // rotg (std::complex<double> a, b, s; real double *c): pass-through to internal::rotg
+
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotm(queue, n, x, incx, y, incy, param, dependencies);
+} // rotm (float; param passed as a pointer): pass-through to internal::rotm, returns its event
+
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotm(queue, n, x, incx, y, incy, param, dependencies);
+} // rotm (double; param passed as a pointer): pass-through to internal::rotm, returns its event
+
+cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                      float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+} // rotmg (float; d1, d2, x1, param are pointers, y1 is by value): pass-through to internal::rotmg
+
+cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+                      double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::rotmg(queue, d1, d2, x1, y1, param, dependencies);
+} // rotmg (double; d1, d2, x1, param are pointers, y1 is by value): pass-through to internal::rotmg
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (float alpha, float *x): pass-through to internal::scal, returns its event
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (double alpha, double *x): pass-through to internal::scal, returns its event
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (std::complex<float> alpha and x): pass-through to internal::scal, returns its event
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (std::complex<double> alpha and x): pass-through to internal::scal, returns its event
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (real float alpha, std::complex<float> *x): pass-through to internal::scal
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies);
+} // scal (real double alpha, std::complex<double> *x): pass-through to internal::scal
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies);
+} // swap (float pointers): pass-through to internal::swap, returns its event
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies);
+} // swap (double pointers): pass-through to internal::swap, returns its event
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies);
+} // swap (std::complex<float> pointers): pass-through to internal::swap, returns its event
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies);
+} // swap (std::complex<double> pointers): pass-through to internal::swap, returns its event
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
+                           const float **b, std::int64_t *ldb, float *beta, float **c,
+                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b,
+                                                ldb, beta, c, ldc, group_count, group_size,
+                                                dependencies);
+} // gemm_batch (float, pointer-array form with group_count/group_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
+                           const double **b, std::int64_t *ldb, double *beta, double **c,
+                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b,
+                                                ldb, beta, c, ldc, group_count, group_size,
+                                                dependencies);
+} // gemm_batch (double, pointer-array form with group_count/group_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, std::complex<float> *alpha,
+                           const std::complex<float> **a, std::int64_t *lda,
+                           const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b,
+                                                ldb, beta, c, ldc, group_count, group_size,
+                                                dependencies);
+} // gemm_batch (std::complex<float>, pointer-array form with group_count/group_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, std::complex<double> *alpha,
+                           const std::complex<double> **a, std::int64_t *lda,
+                           const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b,
+                                                ldb, beta, c, ldc, group_count, group_size,
+                                                dependencies);
+} // gemm_batch (std::complex<double>, pointer-array form with group_count/group_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+                           float *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                                stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                                batch_size, dependencies);
+} // gemm_batch (float, strided form with stride_a/b/c and batch_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+                           double *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                                stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                                batch_size, dependencies);
+} // gemm_batch (double, strided form with stride_a/b/c and batch_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                                stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                                batch_size, dependencies);
+} // gemm_batch (std::complex<float>, strided form with stride_a/b/c and batch_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda,
+                                                stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
+                                                batch_size, dependencies);
+} // gemm_batch (std::complex<double>, strided form with stride_a/b/c and batch_size): pass-through to internal::gemm_batch
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha,
+                      const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                      float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, dependencies);
+} // gemmt (float pointers): pass-through to internal::gemmt, returns its event
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                      const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                      double beta, double *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, dependencies);
+} // gemmt (double pointers): pass-through to internal::gemmt, returns its event
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k,
+                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+                      std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, dependencies);
+} // gemmt (std::complex<float> pointers): pass-through to internal::gemmt, returns its event
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k,
+                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, dependencies);
+} // gemmt (std::complex<double> pointers): pass-through to internal::gemmt, returns its event
+
+} // namespace mklgpu
+} // namespace onemkl
diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
index 2181a4aba..ea02a5cc1 100644
--- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
+++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
@@ -18,6 +18,7 @@
 *******************************************************************************/
 
 #include <CL/sycl.hpp>
+#include <cstdint>
 
 #include "include/allocator_helper.hpp"
 #include "mkl_internal_blas_gpu_wrappers.hpp"
@@ -27,833 +28,860 @@ namespace onemkl {
 namespace mklgpu {
 namespace internal {
 
-void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-          int64_t n, int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
-          cl::sycl::buffer<float, 1> &b, int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c,
-          int64_t ldc) {
+// Buffer APIs
+
+void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+          cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &b,
+          std::int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc) { // float buffers -> mkl::gpu::sgemm; transa/transb go through mkl::cblas_convert
     mkl::gpu::sgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
                     a, lda, b, ldb, beta, c, ldc);
 }
 
-void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-          int64_t n, int64_t k, double alpha, cl::sycl::buffer<double, 1> &a, int64_t lda,
-          cl::sycl::buffer<double, 1> &b, int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c,
-          int64_t ldc) {
+void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+          std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+          cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &b,
+          std::int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc) { // double buffers -> mkl::gpu::dgemm; transa/transb go through mkl::cblas_convert
     mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
                     a, lda, b, ldb, beta, c, ldc);
 }
 
-void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-          int64_t n, int64_t k, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
+          cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) { // complex<float> buffers -> mkl::gpu::cgemm
     mkl::gpu::cgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
                     a, lda, b, ldb, beta, c, ldc);
 }
 
-void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-          int64_t n, int64_t k, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
+          cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) { // complex<double> buffers -> mkl::gpu::zgemm
     mkl::gpu::zgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
                     a, lda, b, ldb, beta, c, ldc);
 }
 
-void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
-          cl::sycl::buffer<float, 1> &b, int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c,
-          int64_t ldc) {
+void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
+          cl::sycl::buffer<float, 1> &c, std::int64_t ldc) { // float buffers -> mkl::gpu::ssymm; side/uplo go through mkl::cblas_convert
     mkl::gpu::ssymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, double alpha, cl::sycl::buffer<double, 1> &a, int64_t lda,
-          cl::sycl::buffer<double, 1> &b, int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c,
-          int64_t ldc) {
+void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
+          cl::sycl::buffer<double, 1> &c, std::int64_t ldc) { // double buffers -> mkl::gpu::dsymm; side/uplo go through mkl::cblas_convert
     mkl::gpu::dsymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-          std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+          std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) { // complex<float> buffers -> mkl::gpu::csymm
     mkl::gpu::csymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+          std::int64_t ldc) { // complex<double> buffers -> mkl::gpu::zsymm; side/uplo go through mkl::cblas_convert
     mkl::gpu::zsymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-          std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
+          std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) { // complex<float> buffers -> mkl::gpu::chemm
     mkl::gpu::chemm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m,
-          int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
+          std::int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+          std::int64_t ldc) { // complex<double> buffers -> mkl::gpu::zhemm; side/uplo go through mkl::cblas_convert
     mkl::gpu::zhemm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n,
                     alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda, float beta,
-          cl::sycl::buffer<float, 1> &c, int64_t ldc) {
+void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
+          cl::sycl::buffer<float, 1> &c, std::int64_t ldc) { // float buffers -> mkl::gpu::ssyrk; uplo/trans go through mkl::cblas_convert
     mkl::gpu::ssyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, double alpha, cl::sycl::buffer<double, 1> &a, int64_t lda, double beta,
-          cl::sycl::buffer<double, 1> &c, int64_t ldc) {
+void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc) { // double buffers -> mkl::gpu::dsyrk; uplo/trans go through mkl::cblas_convert
     mkl::gpu::dsyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-          int64_t ldc) {
+void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+          std::int64_t ldc) { // complex<float> buffers -> mkl::gpu::csyrk; uplo/trans go through mkl::cblas_convert
     mkl::gpu::csyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
-          int64_t ldc) {
+void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+          std::int64_t ldc) { // complex<double> buffers -> mkl::gpu::zsyrk; uplo/trans go through mkl::cblas_convert
     mkl::gpu::zsyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          float beta, cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
+          std::int64_t ldc) {
     mkl::gpu::cherk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-          int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          double beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
+          std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+          std::int64_t ldc) {
     mkl::gpu::zherk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                     a, lda, beta, c, ldc);
 }
 
-void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
-           cl::sycl::buffer<float, 1> &b, int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c,
-           int64_t ldc) {
+void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
+           std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
+           cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
     mkl::gpu::ssyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
-void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, double alpha, cl::sycl::buffer<double, 1> &a, int64_t lda,
-           cl::sycl::buffer<double, 1> &b, int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c,
-           int64_t ldc) {
+void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
+           std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
+           cl::sycl::buffer<double, 1> &c, std::int64_t ldc) {
     mkl::gpu::dsyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
-void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, std::complex<float> alpha,
+           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
+           cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
     mkl::gpu::csyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
-void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, std::complex<double> alpha,
+           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+           std::int64_t ldc) {
     mkl::gpu::zsyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
-void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, float beta,
-           cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, std::complex<float> alpha,
+           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
+           cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
     mkl::gpu::cher2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
-void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n,
-           int64_t k, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, double beta,
-           cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+           std::int64_t n, std::int64_t k, std::complex<double> alpha,
+           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
+           cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
     mkl::gpu::zher2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha,
                      a, lda, b, ldb, beta, c, ldc);
 }
 
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &b, std::int64_t ldb) {
     mkl::gpu::strmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &b,
-          int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &b, std::int64_t ldb) {
     mkl::gpu::dtrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n,
-          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
     mkl::gpu::ctrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n,
-          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
     mkl::gpu::ztrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &b, std::int64_t ldb) {
     mkl::gpu::strsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &b,
-          int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &b, std::int64_t ldb) {
     mkl::gpu::dtrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n,
-          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
     mkl::gpu::ctrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n,
-          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
+          onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
     mkl::gpu::ztrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower),
                     mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda,
                     b, ldb);
 }
 
-void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+          cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::sgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+          cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n,
-          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::cgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n,
-          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::zgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl,
-          int64_t ku, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
-          cl::sycl::buffer<float, 1> &x, int64_t incx, float beta, cl::sycl::buffer<float, 1> &y,
-          int64_t incy) {
+void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer<float, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
+          cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::sgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
                     incy);
 }
 
-void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl,
-          int64_t ku, double alpha, cl::sycl::buffer<double, 1> &a, int64_t lda,
-          cl::sycl::buffer<double, 1> &x, int64_t incx, double beta, cl::sycl::buffer<double, 1> &y,
-          int64_t incy) {
+void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::int64_t kl, std::int64_t ku, double alpha, cl::sycl::buffer<double, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
+          cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
                     incy);
 }
 
-void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl,
-          int64_t ku, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::cgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
                     incy);
 }
 
-void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl,
-          int64_t ku, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
+          std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::zgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
                     incy);
 }
 
-void ger(cl::sycl::queue &queue, int64_t m, int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
-         int64_t incx, cl::sycl::buffer<float, 1> &y, int64_t incy, cl::sycl::buffer<float, 1> &a,
-         int64_t lda) {
+void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+         cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
+         std::int64_t incy, cl::sycl::buffer<float, 1> &a, std::int64_t lda) {
     mkl::gpu::sger(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void ger(cl::sycl::queue &queue, int64_t m, int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
-         int64_t incx, cl::sycl::buffer<double, 1> &y, int64_t incy, cl::sycl::buffer<double, 1> &a,
-         int64_t lda) {
+void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+         cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
+         std::int64_t incy, cl::sycl::buffer<double, 1> &a, std::int64_t lda) {
     mkl::gpu::dger(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
+void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
     mkl::gpu::cgerc(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
+void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
     mkl::gpu::zgerc(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
+void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
     mkl::gpu::cgeru(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
+void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
     mkl::gpu::zgeru(queue, m, n, alpha, x, incx, y, incy, a, lda);
 }
 
-void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k,
-          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+          std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::chbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k,
-          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+          std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::zhbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::chemv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::zhemv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void her(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-         cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
+void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
     mkl::gpu::cher(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda);
 }
 
-void her(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-         cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
+void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
     mkl::gpu::zher(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda);
 }
 
-void her2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
+void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
     mkl::gpu::cher2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda);
 }
 
-void her2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
+void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
     mkl::gpu::zher2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda);
 }
 
-void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex<float> alpha,
+void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
-          int64_t incy) {
+          std::int64_t incx, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::chpmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy);
 }
 
-void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex<double> alpha,
+void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
+          std::int64_t incy) {
     mkl::gpu::zhpmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy);
 }
 
-void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-         cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
+void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<float>, 1> &a) {
     mkl::gpu::chpr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a);
 }
 
-void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-         cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
+void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+         cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<double>, 1> &a) {
     mkl::gpu::zhpr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a);
 }
 
-void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
+void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &a) {
     mkl::gpu::chpr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a);
 }
 
-void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
+void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &a) {
     mkl::gpu::zhpr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a);
 }
 
-void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, float alpha,
-          cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, float alpha,
+          cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::ssbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, double alpha,
-          cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, double alpha,
+          cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dsbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x, int64_t incx, float beta,
-          cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+          cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+          float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::sspmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy);
 }
 
-void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x, int64_t incx, double beta,
-          cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+          cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+          double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dspmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy);
 }
 
-void spr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-         cl::sycl::buffer<float, 1> &x, int64_t incx, cl::sycl::buffer<float, 1> &a) {
+void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+         cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &a) {
     mkl::gpu::sspr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a);
 }
 
-void spr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-         cl::sycl::buffer<double, 1> &x, int64_t incx, cl::sycl::buffer<double, 1> &a) {
+void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+         cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &a) {
     mkl::gpu::dspr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a);
 }
 
-void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &x, int64_t incx, cl::sycl::buffer<float, 1> &y, int64_t incy,
-          cl::sycl::buffer<float, 1> &a) {
+void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
+          std::int64_t incy, cl::sycl::buffer<float, 1> &a) {
     mkl::gpu::sspr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a);
 }
 
-void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &x, int64_t incx, cl::sycl::buffer<double, 1> &y,
-          int64_t incy, cl::sycl::buffer<double, 1> &a) {
+void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
+          std::int64_t incy, cl::sycl::buffer<double, 1> &a) {
     mkl::gpu::dspr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a);
 }
 
-void symv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+          cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::ssymv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void symv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+          cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dsymv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
-void syr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-         cl::sycl::buffer<float, 1> &x, int64_t incx, cl::sycl::buffer<float, 1> &a, int64_t lda) {
+void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+         cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &a,
+         std::int64_t lda) {
     mkl::gpu::ssyr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda);
 }
 
-void syr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-         cl::sycl::buffer<double, 1> &x, int64_t incx, cl::sycl::buffer<double, 1> &a,
-         int64_t lda) {
+void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+         cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &a,
+         std::int64_t lda) {
     mkl::gpu::dsyr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda);
 }
 
-void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha,
-          cl::sycl::buffer<float, 1> &x, int64_t incx, cl::sycl::buffer<float, 1> &y, int64_t incy,
-          cl::sycl::buffer<float, 1> &a, int64_t lda) {
+void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
+          std::int64_t incy, cl::sycl::buffer<float, 1> &a, std::int64_t lda) {
     mkl::gpu::ssyr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda);
 }
 
-void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha,
-          cl::sycl::buffer<double, 1> &x, int64_t incx, cl::sycl::buffer<double, 1> &y,
-          int64_t incy, cl::sycl::buffer<double, 1> &a, int64_t lda) {
+void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
+          std::int64_t incy, cl::sycl::buffer<double, 1> &a, std::int64_t lda) {
     mkl::gpu::dsyr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda);
 }
 
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<float, 1> &a, int64_t lda,
-          cl::sycl::buffer<float, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
     mkl::gpu::stbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<double, 1> &a, int64_t lda,
-          cl::sycl::buffer<double, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx) {
     mkl::gpu::dtbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<float, 1> &a, int64_t lda,
-          cl::sycl::buffer<float, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
     mkl::gpu::stbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<double, 1> &a, int64_t lda,
-          cl::sycl::buffer<double, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx) {
     mkl::gpu::dtbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, k, a, lda, x, incx);
 }
 
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::stpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::dtpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::stpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::dtpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, x, incx);
 }
 
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x,
-          int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
     mkl::gpu::strmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &x,
-          int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx) {
     mkl::gpu::dtrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x,
-          int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<float, 1> &x, std::int64_t incx) {
     mkl::gpu::strsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &x,
-          int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<double, 1> &x, std::int64_t incx) {
     mkl::gpu::dtrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::ctrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
-          int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+          std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::ztrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans),
                     mkl::cblas_convert(diag), n, a, lda, x, incx);
 }
 
-void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<float, 1> &result) {
+void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::scasum(queue, n, x, incx, result);
 }
 
-void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<double, 1> &result) {
+void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::dzasum(queue, n, x, incx, result);
 }
 
-void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
+void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::sasum(queue, n, x, incx, result);
 }
 
-void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
+void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::dasum(queue, n, x, incx, result);
 }
 
-void axpy(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
-          int64_t incx, cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::saxpy(queue, n, alpha, x, incx, y, incy);
 }
 
-void axpy(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
-          int64_t incx, cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::daxpy(queue, n, alpha, x, incx, y, incy);
 }
 
-void axpy(cl::sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::caxpy(queue, n, alpha, x, incx, y, incy);
 }
 
-void axpy(cl::sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
     mkl::gpu::zaxpy(queue, n, alpha, x, incx, y, incy);
 }
 
-void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::scopy(queue, n, x, incx, y, incy);
 }
 
-void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dcopy(queue, n, x, incx, y, incy);
 }
 
-void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::ccopy(queue, n, x, incx, y, incy);
 }
 
-void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
     mkl::gpu::zcopy(queue, n, x, incx, y, incy);
 }
 
-void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-         cl::sycl::buffer<float, 1> &y, int64_t incy, cl::sycl::buffer<float, 1> &result) {
+void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::sdot(queue, n, x, incx, y, incy, result);
 }
 
-void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-         cl::sycl::buffer<double, 1> &y, int64_t incy, cl::sycl::buffer<double, 1> &result) {
+void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<double, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::ddot(queue, n, x, incx, y, incy, result);
 }
 
-void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer<float, 1> &x,
-            int64_t incx, cl::sycl::buffer<float, 1> &y, int64_t incy,
+void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer<float, 1> &x,
+            std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
             cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::sdsdot(queue, n, sb, x, incx, y, incy, result);
 }
 
-void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-         cl::sycl::buffer<float, 1> &y, int64_t incy, cl::sycl::buffer<double, 1> &result) {
+void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::dsdot(queue, n, x, incx, y, incy, result);
 }
 
-void dotc(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
+void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &result) {
     mkl::gpu::cdotc(queue, n, x, incx, y, incy, result);
 }
 
-void dotc(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
+void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &result) {
     mkl::gpu::zdotc(queue, n, x, incx, y, incy, result);
 }
 
-void dotu(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
+void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &result) {
     mkl::gpu::cdotu(queue, n, x, incx, y, incy, result);
 }
 
-void dotu(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
+void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &result) {
     mkl::gpu::zdotu(queue, n, x, incx, y, incy, result);
 }
 
-void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<float, 1> &result) {
+void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::scnrm2(queue, n, x, incx, result);
 }
 
-void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<double, 1> &result) {
+void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::dznrm2(queue, n, x, incx, result);
 }
 
-void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
+void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &result) {
     mkl::gpu::snrm2(queue, n, x, incx, result);
 }
 
-void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
+void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &result) {
     mkl::gpu::dnrm2(queue, n, x, incx, result);
 }
 
-void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-         int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy, float c,
+void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+         std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
          float s) {
     mkl::gpu::csrot(queue, n, x, incx, y, incy, c, s);
 }
 
-void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-         int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy, double c,
-         double s) {
+void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+         std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
+         double c, double s) {
     mkl::gpu::zdrot(queue, n, x, incx, y, incy, c, s);
 }
 
-void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-         cl::sycl::buffer<float, 1> &y, int64_t incy, float c, float s) {
+void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
     mkl::gpu::srot(queue, n, x, incx, y, incy, c, s);
 }
 
-void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-         cl::sycl::buffer<double, 1> &y, int64_t incy, double c, double s) {
+void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+         cl::sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
     mkl::gpu::drot(queue, n, x, incx, y, incy, c, s);
 }
 
@@ -879,13 +907,13 @@ void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
     mkl::gpu::zrotg(queue, a, b, c, s);
 }
 
-void rotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          cl::sycl::buffer<float, 1> &y, int64_t incy, cl::sycl::buffer<float, 1> &param) {
+void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<float, 1> &param) {
     mkl::gpu::srotm(queue, n, x, incx, y, incy, param);
 }
 
-void rotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          cl::sycl::buffer<double, 1> &y, int64_t incy, cl::sycl::buffer<double, 1> &param) {
+void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<double, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &param) {
     mkl::gpu::drotm(queue, n, x, incx, y, incy, param);
 }
 
@@ -899,245 +927,96 @@ void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1, cl::sycl::bu
     mkl::gpu::drotmg(queue, d1, d2, x1, y1, param);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
-          int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::sscal(queue, n, alpha, x, incx);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
-          int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
+          std::int64_t incx) {
     mkl::gpu::dscal(queue, n, alpha, x, incx);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::cscal(queue, n, alpha, x, incx);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::zscal(queue, n, alpha, x, incx);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, float alpha,
-          cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
+          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
     mkl::gpu::csscal(queue, n, alpha, x, incx);
 }
 
-void scal(cl::sycl::queue &queue, int64_t n, double alpha,
-          cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
+void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
+          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
     mkl::gpu::zdscal(queue, n, alpha, x, incx);
 }
 
-void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          cl::sycl::buffer<float, 1> &y, int64_t incy) {
+void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<float, 1> &y, std::int64_t incy) {
     mkl::gpu::sswap(queue, n, x, incx, y, incy);
 }
 
-void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          cl::sycl::buffer<double, 1> &y, int64_t incy) {
+void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
+          cl::sycl::buffer<double, 1> &y, std::int64_t incy) {
     mkl::gpu::dswap(queue, n, x, incx, y, incy);
 }
 
-void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
+void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
     mkl::gpu::cswap(queue, n, x, incx, y, incy);
 }
 
-void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
+void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+          std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
     mkl::gpu::zswap(queue, n, x, incx, y, incy);
 }
 
-void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-           cl::sycl::buffer<int64_t, 1> &result) {
+void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+           cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::isamax(queue, n, x, incx, result);
 }
 
-void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-           cl::sycl::buffer<int64_t, 1> &result) {
+void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::idamax(queue, n, x, incx, result);
 }
 
-void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-           int64_t incx, cl::sycl::buffer<int64_t, 1> &result) {
+void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::icamax(queue, n, x, incx, result);
 }
 
-void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-           int64_t incx, cl::sycl::buffer<int64_t, 1> &result) {
+void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::izamax(queue, n, x, incx, result);
 }
 
-void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-           cl::sycl::buffer<int64_t, 1> &result) {
+void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
+           cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::isamin(queue, n, x, incx, result);
 }
 
-void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-           cl::sycl::buffer<int64_t, 1> &result) {
+void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::idamin(queue, n, x, incx, result);
 }
 
-void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-           int64_t incx, cl::sycl::buffer<int64_t, 1> &result) {
+void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::icamin(queue, n, x, incx, result);
 }
 
-void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-           int64_t incx, cl::sycl::buffer<int64_t, 1> &result) {
+void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
+           std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result) {
     mkl::gpu::izamin(queue, n, x, incx, result);
 }
 
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>();
-    auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto k_acc          = k.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>();
-    auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a =
-            (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i];
-        stride_b =
-            (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i];
-        stride_c = ldc_acc[i] * n_acc[i];
-        mkl::gpu::sgemm_batch(
-            queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i],
-            n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b,
-            beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-        off_c += stride_c * group_size_acc[i];
-    }
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<double, 1> &beta, cl::sycl::buffer<double, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>();
-    auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto k_acc          = k.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>();
-    auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a =
-            (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i];
-        stride_b =
-            (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i];
-        stride_c = ldc_acc[i] * n_acc[i];
-        mkl::gpu::dgemm_batch(
-            queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i],
-            n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b,
-            beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-        off_c += stride_c * group_size_acc[i];
-    }
-}
-
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>();
-    auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto k_acc          = k.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>();
-    auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a =
-            (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i];
-        stride_b =
-            (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i];
-        stride_c = ldc_acc[i] * n_acc[i];
-        mkl::gpu::cgemm_batch(
-            queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i],
-            n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b,
-            beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-        off_c += stride_c * group_size_acc[i];
-    }
-}
-
-void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto transa_acc     = transa.get_access<cl::sycl::access::mode::read>();
-    auto transb_acc     = transb.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto k_acc          = k.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto beta_acc       = beta.get_access<cl::sycl::access::mode::read>();
-    auto ldc_acc        = ldc.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a =
-            (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i];
-        stride_b =
-            (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i];
-        stride_c = ldc_acc[i] * n_acc[i];
-        mkl::gpu::zgemm_batch(
-            queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i],
-            n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b,
-            beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-        off_c += stride_c * group_size_acc[i];
-    }
-}
-
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
@@ -1184,135 +1063,6 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp
                           batch_size);
 }
 
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>();
-    auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>();
-    auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>();
-    auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, off_a = 0, off_b = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a = (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-        stride_b = ldb_acc[i] * n_acc[i];
-        mkl::gpu::strsm_batch(queue, mkl::cblas_convert(side_acc[i]),
-                              mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]),
-                              mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a,
-                              lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i],
-                              off_a, off_b);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-    }
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>();
-    auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>();
-    auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>();
-    auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, off_a = 0, off_b = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a = (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-        stride_b = ldb_acc[i] * n_acc[i];
-        mkl::gpu::dtrsm_batch(queue, mkl::cblas_convert(side_acc[i]),
-                              mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]),
-                              mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a,
-                              lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i],
-                              off_a, off_b);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-    }
-}
-
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>();
-    auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>();
-    auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>();
-    auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, off_a = 0, off_b = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a = (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-        stride_b = ldb_acc[i] * n_acc[i];
-        mkl::gpu::ctrsm_batch(queue, mkl::cblas_convert(side_acc[i]),
-                              mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]),
-                              mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a,
-                              lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i],
-                              off_a, off_b);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-    }
-}
-
-void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    auto side_acc       = left_right.get_access<cl::sycl::access::mode::read>();
-    auto uplo_acc       = upper_lower.get_access<cl::sycl::access::mode::read>();
-    auto trans_acc      = trans.get_access<cl::sycl::access::mode::read>();
-    auto diag_acc       = unit_diag.get_access<cl::sycl::access::mode::read>();
-    auto m_acc          = m.get_access<cl::sycl::access::mode::read>();
-    auto n_acc          = n.get_access<cl::sycl::access::mode::read>();
-    auto alpha_acc      = alpha.get_access<cl::sycl::access::mode::read>();
-    auto lda_acc        = lda.get_access<cl::sycl::access::mode::read>();
-    auto ldb_acc        = ldb.get_access<cl::sycl::access::mode::read>();
-    auto group_size_acc = group_size.get_access<cl::sycl::access::mode::read>();
-    int64_t stride_a, stride_b, off_a = 0, off_b = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        stride_a = (side_acc[i] == side::left) ? lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i];
-        stride_b = ldb_acc[i] * n_acc[i];
-        mkl::gpu::ztrsm_batch(queue, mkl::cblas_convert(side_acc[i]),
-                              mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]),
-                              mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a,
-                              lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i],
-                              off_a, off_b);
-        off_a += stride_a * group_size_acc[i];
-        off_b += stride_b * group_size_acc[i];
-    }
-}
-
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
@@ -1356,51 +1106,52 @@ void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo up
 }
 
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
-           onemkl::transpose transb, int64_t n, int64_t k, float alpha,
-           cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb,
-           float beta, cl::sycl::buffer<float, 1> &c, int64_t ldc) {
+           onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha,
+           cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &b,
+           std::int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
     mkl::gpu::sgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
                      mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
-           onemkl::transpose transb, int64_t n, int64_t k, double alpha,
-           cl::sycl::buffer<double, 1> &a, int64_t lda, cl::sycl::buffer<double, 1> &b, int64_t ldb,
-           double beta, cl::sycl::buffer<double, 1> &c, int64_t ldc) {
+           onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha,
+           cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &b,
+           std::int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc) {
     mkl::gpu::dgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
                      mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
-           onemkl::transpose transb, int64_t n, int64_t k, std::complex<double> alpha,
-           cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-           cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
+           onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
+           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
+           std::int64_t ldc) {
     mkl::gpu::zgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
                      mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
-           onemkl::transpose transb, int64_t n, int64_t k, std::complex<float> alpha,
-           cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-           cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
+           onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
+           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
+           cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
     mkl::gpu::cgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
                      mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
-void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-          int64_t n, int64_t k, half alpha, cl::sycl::buffer<half, 1> &a, int64_t lda,
-          cl::sycl::buffer<half, 1> &b, int64_t ldb, half beta, cl::sycl::buffer<half, 1> &c,
-          int64_t ldc) {
+void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+          std::int64_t m, std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
+          std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
+          cl::sycl::buffer<half, 1> &c, std::int64_t ldc) {
     mkl::gpu::hgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
                     a, lda, b, ldb, beta, c, ldc);
 }
 
-void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m,
-              int64_t n, int64_t k, float alpha, cl::sycl::buffer<half, 1> &a, int64_t lda,
-              cl::sycl::buffer<half, 1> &b, int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c,
-              int64_t ldc) {
+void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+              cl::sycl::buffer<half, 1> &a, std::int64_t lda, cl::sycl::buffer<half, 1> &b,
+              std::int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc) {
     mkl::gpu::gemm_f16f16f32(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k,
                              alpha, a, lda, b, ldb, beta, c, ldc);
 }
@@ -1459,40 +1210,1457 @@ static inline void copy_mat(T_src &src, int row, int col, int ld, onemkl::offset
 }
 
 void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
-              onemkl::offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-              cl::sycl::buffer<int8_t, 1> &a, int64_t lda, int8_t ao,
-              cl::sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-              cl::sycl::buffer<int32_t, 1> &c, int64_t ldc, cl::sycl::buffer<int32_t, 1> &co) {
+              onemkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+              cl::sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
+              cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
+              cl::sycl::buffer<int32_t, 1> &c, std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co) {
     // DGEMM is used for reference implementation to maximize accuracy.
     // Optimized implementation for specific architectures will be added in future releases.
-    int64_t sizea, sizeb, sizec;
-    sizea       = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m;
-    sizeb       = (transb == onemkl::transpose::nontrans) ? ldb * n : ldb * k;
-    sizec       = ldc * n;
-    double *ad  = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizea);
-    double *bd  = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizeb);
-    double *cd  = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizec);
-    double aod  = ao;
-    double bod  = bo;
-    auto acc_a  = a.template get_access<cl::sycl::access::mode::read>();
-    auto acc_b  = b.template get_access<cl::sycl::access::mode::read>();
-    auto acc_c  = c.template get_access<cl::sycl::access::mode::read_write>();
-    auto acc_co = co.template get_access<cl::sycl::access::mode::read_write>();
-    copy_mat(acc_a, transa, m, k, lda, aod, ad);
-    copy_mat(acc_b, transb, k, n, ldb, bod, bd);
-    copy_mat(acc_c, onemkl::transpose::nontrans, m, n, ldc, 0.0, cd);
-    cl::sycl::buffer<double, 1> A_buf(ad, sizea);
-    cl::sycl::buffer<double, 1> B_buf(bd, sizeb);
-    cl::sycl::buffer<double, 1> C_buf(cd, sizec);
-    mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha,
-                    A_buf, lda, B_buf, ldb, beta, C_buf, ldc);
-    auto acc_cd = C_buf.template get_access<cl::sycl::access::mode::read>();
-    copy_mat(acc_cd, m, n, ldc, offsetc, acc_co, acc_c);
+    std::int64_t sizea, sizeb, sizec;  // element counts of the double staging arrays
+    sizea      = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m;
+    sizeb      = (transb == onemkl::transpose::nontrans) ? ldb * n : ldb * k;
+    sizec      = ldc * n;
+    double *ad = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizea);
+    double *bd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizeb);
+    double *cd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizec);  // TODO: null checks
+    {  // A_buf/B_buf/C_buf and all accessors die at the closing brace, before aligned_free
+        double alphad = alpha;  // scalars widened to double for the dgemm call
+        double betad  = beta;
+        double aod    = ao;  // offsets handed to copy_mat as double
+        double bod    = bo;
+        auto acc_a    = a.template get_access<cl::sycl::access::mode::read>();
+        auto acc_b    = b.template get_access<cl::sycl::access::mode::read>();
+        auto acc_c    = c.template get_access<cl::sycl::access::mode::read_write>();
+        auto acc_co   = co.template get_access<cl::sycl::access::mode::read_write>();
+        copy_mat(acc_a, transa, m, k, lda, aod, ad);
+        copy_mat(acc_b, transb, k, n, ldb, bod, bd);
+        copy_mat(acc_c, onemkl::transpose::nontrans, m, n, ldc, 0.0, cd);
+        cl::sycl::buffer<double, 1> A_buf(ad, sizea);
+        cl::sycl::buffer<double, 1> B_buf(bd, sizeb);
+        cl::sycl::buffer<double, 1> C_buf(cd, sizec);
+        mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k,
+                        alphad, A_buf, lda, B_buf, ldb, betad, C_buf, ldc);
+        auto acc_cd = C_buf.template get_access<cl::sycl::access::mode::read>();
+        copy_mat(acc_cd, m, n, ldc, offsetc, acc_co, acc_c);  // presumably stores result (+co) in c
+    }  // end of buffer/accessor scope
     onemkl::aligned_free(ad);
     onemkl::aligned_free(bd);
     onemkl::aligned_free(cd);
 }
 
+// USM (Unified Shared Memory) APIs: raw-pointer overloads taking dependency events, returning an event
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
+                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM gemm (float): forwards to mkl::gpu::sgemm_sycl and returns its event
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM gemm (double): forwards to mkl::gpu::dgemm_sycl and returns its event
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM gemm (complex<float>): forwards to mkl::gpu::cgemm_sycl and returns its event
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM gemm (complex<double>): forwards to mkl::gpu::zgemm_sycl and returns its event
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssymm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM symm (float): forwards to mkl::gpu::ssymm_sycl and returns its event
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsymm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM symm (double): forwards to mkl::gpu::dsymm_sycl and returns its event
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::csymm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM symm (complex<float>): forwards to mkl::gpu::csymm_sycl and returns its event
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zsymm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM symm (complex<double>): forwards to mkl::gpu::zsymm_sycl and returns its event
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chemm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM hemm (complex<float>): forwards to mkl::gpu::chemm_sycl and returns its event
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhemm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta,
+                                c, ldc, dependencies);
+}  // USM hemm (complex<double>): forwards to mkl::gpu::zhemm_sycl and returns its event
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                     float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM syrk (float): forwards to mkl::gpu::ssyrk_sycl and returns its event
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM syrk (double): forwards to mkl::gpu::dsyrk_sycl and returns its event
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
+                     std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::csyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM syrk (complex<float>): forwards to mkl::gpu::csyrk_sycl and returns its event
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
+                     std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zsyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM syrk (complex<double>): forwards to mkl::gpu::zsyrk_sycl and returns its event
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
+                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cherk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM herk (complex<float>, real alpha/beta): forwards to mkl::gpu::cherk_sycl
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
+                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zherk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                n, k, alpha, a, lda, beta, c, ldc, dependencies);
+}  // USM herk (complex<double>, real alpha/beta): forwards to mkl::gpu::zherk_sycl
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM syr2k (float): forwards to mkl::gpu::ssyr2k_sycl and returns its event
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM syr2k (double): forwards to mkl::gpu::dsyr2k_sycl and returns its event
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::csyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM syr2k (complex<float>): forwards to mkl::gpu::csyr2k_sycl and returns its event
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zsyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM syr2k (complex<double>): forwards to mkl::gpu::zsyr2k_sycl and returns its event
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cher2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM her2k (complex<float>, real beta): forwards to mkl::gpu::cher2k_sycl
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, double beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zher2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans),
+                                 n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // USM her2k (complex<double>, real beta): forwards to mkl::gpu::zher2k_sycl
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::strmm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trmm (float): forwards to mkl::gpu::strmm_sycl and returns its event
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtrmm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trmm (double): forwards to mkl::gpu::dtrmm_sycl and returns its event
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctrmm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trmm (complex<float>): forwards to mkl::gpu::ctrmm_sycl and returns its event
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztrmm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trmm (complex<double>): forwards to mkl::gpu::ztrmm_sycl and returns its event
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::strsm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trsm (float): forwards to mkl::gpu::strsm_sycl and returns its event
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtrsm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trsm (double): forwards to mkl::gpu::dtrsm_sycl and returns its event
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctrsm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trsm (complex<float>): forwards to mkl::gpu::ctrsm_sycl and returns its event
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztrsm_sycl(&queue, mkl::cblas_convert(left_right),
+                                mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa),
+                                mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb,
+                                dependencies);
+}  // USM trsm (complex<double>): forwards to mkl::gpu::ztrsm_sycl and returns its event
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // USM gemv (float): forwards to mkl::gpu::sgemv_sycl and returns its event
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // USM gemv (double): forwards to mkl::gpu::dgemv_sycl and returns its event
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // USM gemv (complex<float>): forwards to mkl::gpu::cgemv_sycl and returns its event
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // USM gemv (complex<double>): forwards to mkl::gpu::zgemv_sycl and returns its event
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
+                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x,
+                                incx, beta, y, incy, dependencies);
+}  // USM gbmv (float): forwards to mkl::gpu::sgbmv_sycl and returns its event
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x,
+                                incx, beta, y, incy, dependencies);
+}  // USM gbmv (double): forwards to mkl::gpu::dgbmv_sycl and returns its event
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x,
+                                incx, beta, y, incy, dependencies);
+}  // USM gbmv (complex<float>): forwards to mkl::gpu::cgbmv_sycl and returns its event
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x,
+                                incx, beta, y, incy, dependencies);
+}  // USM gbmv (complex<double>): forwards to mkl::gpu::zgbmv_sycl and returns its event
+
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sger_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM ger (float): forwards to mkl::gpu::sger_sycl and returns its event
+
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                    double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dger_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM ger (double): forwards to mkl::gpu::dger_sycl and returns its event
+
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgerc_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM gerc (complex<float>): forwards to mkl::gpu::cgerc_sycl and returns its event
+
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgerc_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM gerc (complex<double>): forwards to mkl::gpu::zgerc_sycl and returns its event
+
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgeru_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM geru (complex<float>): forwards to mkl::gpu::cgeru_sycl and returns its event
+
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgeru_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}  // USM geru (complex<double>): forwards to mkl::gpu::zgeru_sycl and returns its event
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // USM hbmv (complex<float>): forwards to mkl::gpu::chbmv_sycl and returns its event
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // hbmv (complex<double>): uplo passed through cblas_convert; returns zhbmv_sycl's event
+
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chemv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta,
+                                y, incy, dependencies);
+}  // hemv (complex<float>): uplo passed through cblas_convert; returns chemv_sycl's event
+
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhemv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta,
+                                y, incy, dependencies);
+}  // hemv (complex<double>): uplo passed through cblas_convert; returns zhemv_sycl's event
+
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cher_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a, lda,
+                               dependencies);
+}  // her (complex<float>): uplo passed through cblas_convert; returns cher_sycl's event
+
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    std::int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zher_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a, lda,
+                               dependencies);
+}  // her (complex<double>): uplo passed through cblas_convert; returns zher_sycl's event
+
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cher2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                lda, dependencies);
+}  // her2 (complex<float>): uplo passed through cblas_convert; returns cher2_sycl's event
+
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zher2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                lda, dependencies);
+}  // her2 (complex<double>): uplo passed through cblas_convert; returns zher2_sycl's event
+
+cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chpmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y,
+                                incy, dependencies);
+}  // hpmv (complex<float>): uplo passed through cblas_convert; returns chpmv_sycl's event
+
+cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhpmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y,
+                                incy, dependencies);
+}  // hpmv (complex<double>): uplo passed through cblas_convert; returns zhpmv_sycl's event
+
+cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chpr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a,
+                               dependencies);
+}  // hpr (complex<float>): uplo passed through cblas_convert; returns chpr_sycl's event
+
+cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhpr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a,
+                               dependencies);
+}  // hpr (complex<double>): uplo passed through cblas_convert; returns zhpr_sycl's event
+
+cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::chpr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                dependencies);
+}  // hpr2 (complex<float>): uplo passed through cblas_convert; returns chpr2_sycl's event
+
+cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zhpr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                dependencies);
+}  // hpr2 (complex<double>): uplo passed through cblas_convert; returns zhpr2_sycl's event
+
+cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // sbmv (float): uplo passed through cblas_convert; returns ssbmv_sycl's event
+
+cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     double alpha, const double *a, std::int64_t lda, const double *x,
+                     std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx,
+                                beta, y, incy, dependencies);
+}  // sbmv (double): uplo passed through cblas_convert; returns dsbmv_sycl's event
+
+cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *a, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sspmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y,
+                                incy, dependencies);
+}  // spmv (float): uplo passed through cblas_convert; returns sspmv_sycl's event
+
+cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *a, const double *x, std::int64_t incx, double beta, double *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dspmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y,
+                                incy, dependencies);
+}  // spmv (double): uplo passed through cblas_convert; returns dspmv_sycl's event
+
+cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sspr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a,
+                               dependencies);
+}  // spr (float): uplo passed through cblas_convert; returns sspr_sycl's event
+
+cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dspr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a,
+                               dependencies);
+}  // spr (double): uplo passed through cblas_convert; returns dspr_sycl's event
+
+cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sspr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                dependencies);
+}  // spr2 (float): uplo passed through cblas_convert; returns sspr2_sycl's event
+
+cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dspr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                dependencies);
+}  // spr2 (double): uplo passed through cblas_convert; returns dspr2_sycl's event
+
+cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                     float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssymv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta,
+                                y, incy, dependencies);
+}  // symv (float): uplo passed through cblas_convert; returns ssymv_sycl's event
+
+cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsymv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta,
+                                y, incy, dependencies);
+}  // symv (double): uplo passed through cblas_convert; returns dsymv_sycl's event
+
+cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssyr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a, lda,
+                               dependencies);
+}  // syr (float): uplo passed through cblas_convert; returns ssyr_sycl's event
+
+cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsyr_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, a, lda,
+                               dependencies);
+}  // syr (double): uplo passed through cblas_convert; returns dsyr_sycl's event
+
+cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ssyr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                lda, dependencies);
+}  // syr2 (float): uplo passed through cblas_convert; returns ssyr2_sycl's event
+
+cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsyr2_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, x, incx, y, incy, a,
+                                lda, dependencies);
+}  // syr2 (double): uplo passed through cblas_convert; returns dsyr2_sycl's event
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::stbmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbmv (float): uplo/trans/diag via cblas_convert; returns stbmv_sycl's event
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtbmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbmv (double): uplo/trans/diag via cblas_convert; returns dtbmv_sycl's event
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctbmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbmv (complex<float>): uplo/trans/diag via cblas_convert; returns ctbmv_sycl's event
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztbmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbmv (complex<double>): uplo/trans/diag via cblas_convert; returns ztbmv_sycl's event
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::stbsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbsv (float): uplo/trans/diag via cblas_convert; returns stbsv_sycl's event
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtbsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbsv (double): uplo/trans/diag via cblas_convert; returns dtbsv_sycl's event
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctbsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbsv (complex<float>): uplo/trans/diag via cblas_convert; returns ctbsv_sycl's event
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztbsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies);
+}  // tbsv (complex<double>): uplo/trans/diag via cblas_convert; returns ztbsv_sycl's event
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::stpmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpmv (float): uplo/trans/diag via cblas_convert; returns stpmv_sycl's event
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtpmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpmv (double): uplo/trans/diag via cblas_convert; returns dtpmv_sycl's event
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctpmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpmv (complex<float>): uplo/trans/diag via cblas_convert; returns ctpmv_sycl's event
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztpmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpmv (complex<double>): uplo/trans/diag via cblas_convert; returns ztpmv_sycl's event
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::stpsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpsv (float): uplo/trans/diag via cblas_convert; returns stpsv_sycl's event
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtpsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpsv (double): uplo/trans/diag via cblas_convert; returns dtpsv_sycl's event
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctpsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpsv (complex<float>): uplo/trans/diag via cblas_convert; returns ctpsv_sycl's event
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztpsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, x, incx, dependencies);
+}  // tpsv (complex<double>): uplo/trans/diag via cblas_convert; returns ztpsv_sycl's event
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::strmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trmv (float): uplo/trans/diag via cblas_convert; returns strmv_sycl's event
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda,
+                     double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtrmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trmv (double): uplo/trans/diag via cblas_convert; returns dtrmv_sycl's event
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctrmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trmv (complex<float>): uplo/trans/diag via cblas_convert; returns ctrmv_sycl's event
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztrmv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trmv (complex<double>): uplo/trans/diag via cblas_convert; returns ztrmv_sycl's event
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::strsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trsv (float): uplo/trans/diag via cblas_convert; returns strsv_sycl's event
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda,
+                     double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dtrsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trsv (double): uplo/trans/diag via cblas_convert; returns dtrsv_sycl's event
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ctrsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trsv (complex<float>): uplo/trans/diag via cblas_convert; returns ctrsv_sycl's event
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ztrsv_sycl(&queue, mkl::cblas_convert(uplo), mkl::cblas_convert(trans),
+                                mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies);
+}  // trsv (complex<double>): uplo/trans/diag via cblas_convert; returns ztrsv_sycl's event
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::scasum_sycl(&queue, n, x, incx, result, dependencies);
+}  // asum (complex<float> x, float result): forwards to scasum_sycl; returns its event
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dzasum_sycl(&queue, n, x, incx, result, dependencies);
+}  // asum (complex<double> x, double result): forwards to dzasum_sycl; returns its event
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sasum_sycl(&queue, n, x, incx, result, dependencies);
+}  // asum (float): passes &queue and unchanged args to sasum_sycl; returns its event
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dasum_sycl(&queue, n, x, incx, result, dependencies);
+}  // asum (double): passes &queue and unchanged args to dasum_sycl; returns its event
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+                     std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::saxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (float): passes &queue and unchanged args to saxpy_sycl; returns its event
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+                     std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::daxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (double): passes &queue and unchanged args to daxpy_sycl; returns its event
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::caxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (complex<float>): passes &queue and unchanged args to caxpy_sycl; returns its event
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zaxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (complex<double>): passes &queue and unchanged args to zaxpy_sycl; returns its event
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::scopy_sycl(&queue, n, x, incx, y, incy, dependencies);
+}  // copy (float): passes &queue and unchanged args to scopy_sycl; returns its event
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dcopy_sycl(&queue, n, x, incx, y, incy, dependencies);
+}  // copy (double): passes &queue and unchanged args to dcopy_sycl; returns its event
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ccopy_sycl(&queue, n, x, incx, y, incy, dependencies);
+}  // copy (complex<float>): passes &queue and unchanged args to ccopy_sycl; returns its event
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zcopy_sycl(&queue, n, x, incx, y, incy, dependencies);
+}  // copy (complex<double>): passes &queue and unchanged args to zcopy_sycl; returns its event
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sdot_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}  // dot (float): passes &queue and unchanged args to sdot_sycl; returns its event
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                    const double *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::ddot_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}  // dot (double): passes &queue and unchanged args to ddot_sycl; returns its event
+
+cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                       std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sdsdot_sycl(&queue, n, sb, x, incx, y, incy, result, dependencies);
+}  // sdsdot (float, with scalar sb): forwards unchanged args to sdsdot_sycl; returns its event
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dsdot_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}  // dot (float x/y, double result): forwards unchanged args to dsdot_sycl; returns its event
+
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cdotc_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}  // dotc (complex<float>): passes &queue and unchanged args to cdotc_sycl; returns its event
+
+// dotc, std::complex<double> pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::zdotc_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zdotc_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}
+
+// dotu, std::complex<float> pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::cdotu_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cdotu_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}
+
+// dotu, std::complex<double> pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::zdotu_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zdotu_sycl(&queue, n, x, incx, y, incy, result, dependencies);
+}
+
+// nrm2, std::complex<float> input / float result pointer (USM) overload: forwards every
+// argument unchanged to mkl::gpu::scnrm2_sycl, passing the queue by address.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::scnrm2_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// nrm2, std::complex<double> input / double result pointer (USM) overload: forwards every
+// argument unchanged to mkl::gpu::dznrm2_sycl, passing the queue by address.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dznrm2_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// nrm2, float pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::snrm2_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::snrm2_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// nrm2, double pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::dnrm2_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dnrm2_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// rot, std::complex<float> vectors with float c and s: forwards every argument unchanged to
+// mkl::gpu::csrot_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::csrot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies);
+}
+
+// rot, std::complex<double> vectors with double c and s: forwards every argument unchanged to
+// mkl::gpu::zdrot_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
+                    double s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zdrot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies);
+}
+
+// rot, float pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::srot_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                    std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::srot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies);
+}
+
+// rot, double pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::drot_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+                    std::int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::drot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies);
+}
+
+// rotg, float pointer (USM) overload: forwards the four pointers and the dependency list
+// unchanged to mkl::gpu::srotg_sycl, passing the queue by address.
+cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::srotg_sycl(&queue, a, b, c, s, dependencies);
+}
+
+// rotg, double pointer (USM) overload: forwards the four pointers and the dependency list
+// unchanged to mkl::gpu::drotg_sycl, passing the queue by address.
+cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::drotg_sycl(&queue, a, b, c, s, dependencies);
+}
+
+// rotg, std::complex<float> overload (a, b, s are complex; c is a float pointer): forwards
+// every argument unchanged to mkl::gpu::crotg_sycl, passing the queue by address.
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
+                     float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::crotg_sycl(&queue, a, b, c, s, dependencies);
+}
+
+// rotg, std::complex<double> overload (a, b, s are complex; c is a double pointer): forwards
+// every argument unchanged to mkl::gpu::zrotg_sycl, passing the queue by address.
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+                     double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zrotg_sycl(&queue, a, b, c, s, dependencies);
+}
+
+// rotm, float pointer (USM) overload: forwards every argument, including the param pointer,
+// unchanged to mkl::gpu::srotm_sycl, passing the queue by address.
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::srotm_sycl(&queue, n, x, incx, y, incy, param, dependencies);
+}
+
+// rotm, double pointer (USM) overload: forwards every argument, including the param pointer,
+// unchanged to mkl::gpu::drotm_sycl, passing the queue by address.
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::drotm_sycl(&queue, n, x, incx, y, incy, param, dependencies);
+}
+
+// rotmg, float overload (d1, d2, x1, param are pointers; y1 is passed by value): forwards
+// every argument unchanged to mkl::gpu::srotmg_sycl, passing the queue by address.
+cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                      float *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::srotmg_sycl(&queue, d1, d2, x1, y1, param, dependencies);
+}
+
+// rotmg, double overload (d1, d2, x1, param are pointers; y1 is passed by value): forwards
+// every argument unchanged to mkl::gpu::drotmg_sycl, passing the queue by address.
+cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+                      double *param, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::drotmg_sycl(&queue, d1, d2, x1, y1, param, dependencies);
+}
+
+// scal, float alpha / float vector pointer (USM) overload: forwards every argument unchanged
+// to mkl::gpu::sscal_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// scal, double alpha / double vector pointer (USM) overload: forwards every argument unchanged
+// to mkl::gpu::dscal_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// scal, std::complex<float> alpha and vector: forwards every argument unchanged to
+// mkl::gpu::cscal_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// scal, std::complex<double> alpha and vector: forwards every argument unchanged to
+// mkl::gpu::zscal_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// scal, mixed overload (float alpha, std::complex<float> vector): forwards every argument
+// unchanged to mkl::gpu::csscal_sycl, passing the queue by address.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::csscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// scal, mixed overload (double alpha, std::complex<double> vector): forwards every argument
+// unchanged to mkl::gpu::zdscal_sycl, passing the queue by address.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zdscal_sycl(&queue, n, alpha, x, incx, dependencies);
+}
+
+// swap, float pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::sswap_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sswap_sycl(&queue, n, x, incx, y, incy, dependencies);
+}
+
+// swap, double pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::dswap_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dswap_sycl(&queue, n, x, incx, y, incy, dependencies);
+}
+
+// swap, std::complex<float> pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::cswap_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cswap_sycl(&queue, n, x, incx, y, incy, dependencies);
+}
+
+// swap, std::complex<double> pointer (USM) overload: forwards every argument unchanged to
+// mkl::gpu::zswap_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zswap_sycl(&queue, n, x, incx, y, incy, dependencies);
+}
+
+// iamax, float input with std::int64_t result pointer: forwards every argument unchanged to
+// mkl::gpu::isamax_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::isamax_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamax, double input with std::int64_t result pointer: forwards every argument unchanged to
+// mkl::gpu::idamax_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::idamax_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamax, std::complex<float> input with std::int64_t result pointer: forwards every argument
+// unchanged to mkl::gpu::icamax_sycl, passing the queue by address.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::icamax_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamax, std::complex<double> input with std::int64_t result pointer: forwards every argument
+// unchanged to mkl::gpu::izamax_sycl, passing the queue by address.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::izamax_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamin, float input with std::int64_t result pointer: forwards every argument unchanged to
+// mkl::gpu::isamin_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::isamin_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamin, double input with std::int64_t result pointer: forwards every argument unchanged to
+// mkl::gpu::idamin_sycl, passing the queue by address, and returns that call's result.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::idamin_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamin, std::complex<float> input with std::int64_t result pointer: forwards every argument
+// unchanged to mkl::gpu::icamin_sycl, passing the queue by address.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::icamin_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// iamin, std::complex<double> input with std::int64_t result pointer: forwards every argument
+// unchanged to mkl::gpu::izamin_sycl, passing the queue by address.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::izamin_sycl(&queue, n, x, incx, result, dependencies);
+}
+
+// gemm_batch, strided float overload: converts transa/transb with mkl::cblas_convert and
+// forwards them with the remaining arguments (strides and batch_size included) to
+// mkl::gpu::sgemm_batch. The queue is passed by reference here, not by address.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
+                           const float *a, std::int64_t lda, std::int64_t stride_a, const float *b,
+                           std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
+                           std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                 n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+                                 stride_c, batch_size, dependencies);
+}
+
+// gemm_batch, strided double overload: converts transa/transb with mkl::cblas_convert and
+// forwards them with the remaining arguments (strides and batch_size included) to
+// mkl::gpu::dgemm_batch. The queue is passed by reference here, not by address.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
+                           const double *a, std::int64_t lda, std::int64_t stride_a,
+                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+                           double *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                 n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+                                 stride_c, batch_size, dependencies);
+}
+
+// gemm_batch, strided std::complex<float> overload: converts transa/transb with
+// mkl::cblas_convert and forwards them with the remaining arguments (strides and batch_size
+// included) to mkl::gpu::cgemm_batch. The queue is passed by reference here, not by address.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                 n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+                                 stride_c, batch_size, dependencies);
+}
+
+// gemm_batch, strided std::complex<double> overload: converts transa/transb with
+// mkl::cblas_convert and forwards them with the remaining arguments (strides and batch_size
+// included) to mkl::gpu::zgemm_batch. The queue is passed by reference here, not by address.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                 n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+                                 stride_c, batch_size, dependencies);
+}
+
+// Collapses a list of events into a single event.
+//
+// Every pointer in prereqs is dereferenced, so none may be null. The returned event is
+// allocated with `new` and nothing in this function releases it.
+//  - _WIN64 builds: waits on the host for each prerequisite in order, then returns a
+//    default-constructed event; `queue` is not used on this path.
+//  - other builds: an empty prereqs list yields a default-constructed event; otherwise an empty
+//    single_task that depends_on() every prerequisite is submitted to `queue` and the event of
+//    that submission is returned.
+cl::sycl::event *coalesce_events(cl::sycl::queue &queue, std::vector<cl::sycl::event *> &prereqs) {
+#ifdef _WIN64
+    for (cl::sycl::event *pending : prereqs)
+        pending->wait();
+    return new cl::sycl::event();
+#else
+    if (prereqs.empty())
+        return new cl::sycl::event();
+
+    cl::sycl::event barrier = queue.submit([&](cl::sycl::handler &cgh) {
+        for (cl::sycl::event *pending : prereqs)
+            cgh.depends_on(*pending);
+        // The kernel body is intentionally empty: only its dependencies matter.
+        cgh.single_task<class coalesce_events_kernel>([]() {});
+    });
+    return new cl::sycl::event(barrier);
+#endif
+}
+
+// gemm_batch, group API, float pointer arrays.
+//
+// Issues one mkl::gpu::sgemm_batch call per group. Call i receives transa[i]/transb[i]
+// (through mkl::cblas_convert), m[i], n[i], k[i], alpha[i], lda[i], ldb[i], beta[i], ldc[i]
+// and group_size[i], the a/b/c pointer arrays as given, the sum of group_size[0..i-1], and the
+// caller's `dependencies`. The per-group events are merged by coalesce_events() and the merged
+// event is returned by value.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha,
+                           const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb,
+                           float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::sgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], batch_offset, group_size[i],
+            dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// gemm_batch, group API, double pointer arrays.
+//
+// Issues one mkl::gpu::dgemm_batch call per group. Call i receives transa[i]/transb[i]
+// (through mkl::cblas_convert), m[i], n[i], k[i], alpha[i], lda[i], ldb[i], beta[i], ldc[i]
+// and group_size[i], the a/b/c pointer arrays as given, the sum of group_size[0..i-1], and the
+// caller's `dependencies`. The per-group events are merged by coalesce_events() and the merged
+// event is returned by value.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha,
+                           const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb,
+                           double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::dgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], batch_offset, group_size[i],
+            dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// gemm_batch, group API, std::complex<float> pointer arrays.
+//
+// Issues one mkl::gpu::cgemm_batch call per group. Call i receives transa[i]/transb[i]
+// (through mkl::cblas_convert), m[i], n[i], k[i], alpha[i], lda[i], ldb[i], beta[i], ldc[i]
+// and group_size[i], the a/b/c pointer arrays as given, the sum of group_size[0..i-1], and the
+// caller's `dependencies`. The per-group events are merged by coalesce_events() and the merged
+// event is returned by value.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<float> *alpha, const std::complex<float> **a,
+                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::cgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], batch_offset, group_size[i],
+            dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// gemm_batch, group API, std::complex<double> pointer arrays.
+//
+// Issues one mkl::gpu::zgemm_batch call per group. Call i receives transa[i]/transb[i]
+// (through mkl::cblas_convert), m[i], n[i], k[i], alpha[i], lda[i], ldb[i], beta[i], ldc[i]
+// and group_size[i], the a/b/c pointer arrays as given, the sum of group_size[0..i-1], and the
+// caller's `dependencies`. The per-group events are merged by coalesce_events() and the merged
+// event is returned by value.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<double> *alpha, const std::complex<double> **a,
+                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::zgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], batch_offset, group_size[i],
+            dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// axpy_batch, group API, float pointer arrays.
+//
+// Issues one mkl::gpu::saxpy_batch call per group. Call i receives n[i], alpha[i], incx[i],
+// incy[i] and group_size[i], the x/y pointer arrays as given, the sum of group_size[0..i-1],
+// and the caller's `dependencies`. The per-group events are merged by coalesce_events() and
+// the merged event is returned by value.
+// NOTE(review): group_size[i] is passed before the running sum here, the reverse of the
+// gemm_batch group calls in this file; presumably this matches the saxpy_batch declaration.
+// Confirm against the backend header.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+                           std::int64_t *incx, float **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(
+            mkl::gpu::saxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
+                                  batch_offset, dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// axpy_batch, group API, double pointer arrays.
+//
+// Issues one mkl::gpu::daxpy_batch call per group. Call i receives n[i], alpha[i], incx[i],
+// incy[i] and group_size[i], the x/y pointer arrays as given, the sum of group_size[0..i-1],
+// and the caller's `dependencies`. The per-group events are merged by coalesce_events() and
+// the merged event is returned by value.
+// NOTE(review): group_size[i] is passed before the running sum here, the reverse of the
+// gemm_batch group calls in this file; presumably this matches the daxpy_batch declaration.
+// Confirm against the backend header.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+                           std::int64_t *incx, double **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(
+            mkl::gpu::daxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
+                                  batch_offset, dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// axpy_batch, group API, std::complex<float> pointer arrays.
+//
+// Issues one mkl::gpu::caxpy_batch call per group. Call i receives n[i], alpha[i], incx[i],
+// incy[i] and group_size[i], the x/y pointer arrays as given, the sum of group_size[0..i-1],
+// and the caller's `dependencies`. The per-group events are merged by coalesce_events() and
+// the merged event is returned by value.
+// NOTE(review): group_size[i] is passed before the running sum here, the reverse of the
+// gemm_batch group calls in this file; presumably this matches the caxpy_batch declaration.
+// Confirm against the backend header.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+                           const std::complex<float> **x, std::int64_t *incx,
+                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(
+            mkl::gpu::caxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
+                                  batch_offset, dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// axpy_batch, group API, std::complex<double> pointer arrays.
+//
+// Issues one mkl::gpu::zaxpy_batch call per group. Call i receives n[i], alpha[i], incx[i],
+// incy[i] and group_size[i], the x/y pointer arrays as given, the sum of group_size[0..i-1],
+// and the caller's `dependencies`. The per-group events are merged by coalesce_events() and
+// the merged event is returned by value.
+// NOTE(review): group_size[i] is passed before the running sum here, the reverse of the
+// gemm_batch group calls in this file; presumably this matches the zaxpy_batch declaration.
+// Confirm against the backend header.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+                           const std::complex<double> **x, std::int64_t *incx,
+                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    // Hold the per-group events by value. They used to be allocated with `new` and never
+    // freed, which leaked group_count events on every call.
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t batch_offset = 0; // sum of the sizes of the groups already submitted
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(
+            mkl::gpu::zaxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
+                                  batch_offset, dependencies));
+        batch_offset += group_size[i];
+    }
+
+    // coalesce_events() takes pointers; collect them only once the vector has stopped growing
+    // so they cannot be invalidated by a reallocation.
+    std::vector<cl::sycl::event *> coalesced_events;
+    coalesced_events.reserve(group_events.size());
+    for (cl::sycl::event &group_event : group_events)
+        coalesced_events.push_back(&group_event);
+
+    // coalesce_events() hands back a `new`-allocated event: copy it out, then free the
+    // allocation instead of dereferencing the pointer and dropping it.
+    cl::sycl::event *merged = coalesce_events(queue, coalesced_events);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// gemmt, float pointer (USM) overload: converts upper_lower, transa and transb with
+// mkl::cblas_convert and forwards them with the remaining arguments to
+// mkl::gpu::sgemmt_sycl, passing the queue by address.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::sgemmt_sycl(&queue, mkl::cblas_convert(upper_lower),
+                                 mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k,
+                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+// gemmt, double pointer (USM) overload: converts upper_lower, transa and transb with
+// mkl::cblas_convert and forwards them with the remaining arguments to
+// mkl::gpu::dgemmt_sycl, passing the queue by address.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::dgemmt_sycl(&queue, mkl::cblas_convert(upper_lower),
+                                 mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k,
+                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+// gemmt, std::complex<float> pointer (USM) overload: converts upper_lower, transa and transb
+// with mkl::cblas_convert and forwards them with the remaining arguments to
+// mkl::gpu::cgemmt_sycl, passing the queue by address.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::cgemmt_sycl(&queue, mkl::cblas_convert(upper_lower),
+                                 mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k,
+                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+// gemmt, std::complex<double> pointer (USM) overload: converts upper_lower, transa and transb
+// with mkl::cblas_convert and forwards them with the remaining arguments to
+// mkl::gpu::zgemmt_sycl, passing the queue by address.
+cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgemmt_sycl(&queue, mkl::cblas_convert(upper_lower),
+                                 mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k,
+                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
 } //namespace internal
 } //namespace mklgpu
 } //namespace onemkl
diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp
index e8de7044b..41288a12d 100644
--- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp
+++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp
@@ -28,526 +28,644 @@
 namespace onemkl {
 namespace mklgpu {
 namespace internal {
+
+// Buffer APIs
+
 void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
           std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
           cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &b,
           std::int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
 void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
           std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
           cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &b,
           std::int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
 void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
           std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
           cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
           std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
           cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
+
 void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
           cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
 void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
           cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
 void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
           std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
           std::int64_t ldc);
+
 void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
           std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m,
           std::int64_t n, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
           std::int64_t ldc);
+
 void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
           cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
 void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
 void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &c,
           std::int64_t ldc);
+
 void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
           std::int64_t ldc);
+
 void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, float alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, float beta, cl::sycl::buffer<std::complex<float>, 1> &c,
           std::int64_t ldc);
+
 void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n,
           std::int64_t k, double alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, double beta, cl::sycl::buffer<std::complex<double>, 1> &c,
           std::int64_t ldc);
+
 void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
            std::int64_t lda, cl::sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
            cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
 void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer<double, 1> &a,
            std::int64_t lda, cl::sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
            cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
 void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, std::complex<float> alpha,
            cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, std::complex<double> alpha,
            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
            std::int64_t ldc);
+
 void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, std::complex<float> alpha,
            cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
            std::int64_t n, std::int64_t k, std::complex<double> alpha,
            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
            cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
+
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
+
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
+
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
+
 void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
+
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &b, std::int64_t ldb);
+
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &b, std::int64_t ldb);
+
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
+
 void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
           onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
           std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
+
 void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
           cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
           cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
           std::int64_t incy);
+
 void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer<float, 1> &a,
           std::int64_t lda, cl::sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
           cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::int64_t kl, std::int64_t ku, double alpha, cl::sycl::buffer<double, 1> &a,
           std::int64_t lda, cl::sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
           cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n,
           std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
           std::int64_t incy);
+
 void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
          cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
          std::int64_t incy, cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+
 void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
          cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
          std::int64_t incy, cl::sycl::buffer<double, 1> &a, std::int64_t lda);
+
 void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
+
 void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
+
 void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
+
 void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
+
 void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
           std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
           std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
           std::int64_t incy);
+
 void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
           std::int64_t incy);
+
 void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
+
 void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
+
 void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
+
 void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
+
 void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, std::complex<float> beta, cl::sycl::buffer<std::complex<float>, 1> &y,
           std::int64_t incy);
+
 void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &a,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &y,
           std::int64_t incy);
+
 void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
          cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<float>, 1> &a);
+
 void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
          cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
          cl::sycl::buffer<std::complex<double>, 1> &a);
+
 void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &a);
+
 void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &a);
+
 void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, float alpha,
           cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, double alpha,
           cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
           cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
           cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
          cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &a);
+
 void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
          cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &a);
+
 void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
           std::int64_t incy, cl::sycl::buffer<float, 1> &a);
+
 void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
           std::int64_t incy, cl::sycl::buffer<double, 1> &a);
+
 void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
           cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx, float beta, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
           cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx, double beta, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
          cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &a,
          std::int64_t lda);
+
 void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
          cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &a,
          std::int64_t lda);
+
 void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx, cl::sycl::buffer<float, 1> &y,
           std::int64_t incy, cl::sycl::buffer<float, 1> &a, std::int64_t lda);
+
 void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx, cl::sycl::buffer<double, 1> &y,
           std::int64_t incy, cl::sycl::buffer<double, 1> &a, std::int64_t lda);
+
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<float>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, std::int64_t k, cl::sycl::buffer<std::complex<double>, 1> &a,
           std::int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx);
+
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx);
+
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx);
+
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx);
+
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<float, 1> &a, std::int64_t lda,
           cl::sycl::buffer<float, 1> &x, std::int64_t incx);
+
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
           cl::sycl::buffer<double, 1> &x, std::int64_t incx);
+
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag,
           std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<float, 1> &result);
+
 void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<double, 1> &result);
+
 void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &result);
+
 void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &result);
+
 void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx, cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
 void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
 void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
          cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<float, 1> &result);
+
 void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
          cl::sycl::buffer<double, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &result);
+
 void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer<float, 1> &x,
             std::int64_t incx, cl::sycl::buffer<float, 1> &y, std::int64_t incy,
             cl::sycl::buffer<float, 1> &result);
+
 void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
          cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &result);
+
 void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &result);
+
 void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &result);
+
 void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<float>, 1> &result);
+
 void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
           cl::sycl::buffer<std::complex<double>, 1> &result);
+
 void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<float, 1> &result);
+
 void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<double, 1> &result);
+
 void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &result);
+
 void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &result);
+
 void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
          std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
          float s);
+
 void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
          std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
          double c, double s);
+
 void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
          cl::sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s);
+
 void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
          cl::sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s);
+
 void rotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &b,
           cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<float, 1> &s);
+
 void rotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &b,
           cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<double, 1> &s);
+
 void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
           cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
           cl::sycl::buffer<std::complex<float>, 1> &s);
+
 void rotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
           cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<double, 1> &c,
           cl::sycl::buffer<std::complex<double>, 1> &s);
+
 void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &y, std::int64_t incy, cl::sycl::buffer<float, 1> &param);
+
 void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &y, std::int64_t incy, cl::sycl::buffer<double, 1> &param);
+
 void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1, cl::sycl::buffer<float, 1> &d2,
            cl::sycl::buffer<float, 1> &x1, float y1, cl::sycl::buffer<float, 1> &param);
+
 void rotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1, cl::sycl::buffer<double, 1> &d2,
            cl::sycl::buffer<double, 1> &x1, double y1, cl::sycl::buffer<double, 1> &param);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
           std::int64_t incx);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
           std::int64_t incx);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, float alpha,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
+
 void scal(cl::sycl::queue &queue, std::int64_t n, double alpha,
           cl::sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
+
 void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &y, std::int64_t incy);
+
 void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x, std::int64_t incx,
           cl::sycl::buffer<double, 1> &y, std::int64_t incy);
+
 void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
+
 void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
           std::int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
+
 void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
            cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<float, 1> &x, std::int64_t incx,
            cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<double, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
+
 void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
            std::int64_t incx, cl::sycl::buffer<std::int64_t, 1> &result);
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<float, 1> &beta, cl::sycl::buffer<float, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size);
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<double, 1> &beta, cl::sycl::buffer<double, 1> &c,
-                cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size);
-void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-                cl::sycl::buffer<onemkl::transpose, 1> &transb,
-                cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-void gemm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-    cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
+
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
                 cl::sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
                 std::int64_t batch_size);
+
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
                 cl::sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
                 double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc,
                 std::int64_t stride_c, std::int64_t batch_size);
+
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
                 cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
@@ -555,6 +673,7 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp
                 std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
                 cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
                 std::int64_t stride_c, std::int64_t batch_size);
+
 void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
                 cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
@@ -562,84 +681,61 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp
                 std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
                 cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
                 std::int64_t stride_c, std::int64_t batch_size);
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-                cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-                cl::sycl::buffer<onemkl::transpose, 1> &trans,
-                cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-void trsm_batch(
-    cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-    cl::sycl::buffer<onemkl::uplo, 1> &upper_lower, cl::sycl::buffer<onemkl::transpose, 1> &trans,
-    cl::sycl::buffer<onemkl::diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::complex<double>, 1> &alpha,
-    cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-    cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
+
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 float alpha, cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
                 std::int64_t batch_size);
+
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 double alpha, cl::sycl::buffer<double, 1> &a, std::int64_t lda,
                 std::int64_t stride_a, cl::sycl::buffer<double, 1> &b, std::int64_t ldb,
                 std::int64_t stride_b, std::int64_t batch_size);
+
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a,
                 std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
                 std::int64_t stride_b, std::int64_t batch_size);
+
 void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
                 onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
                 std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
                 std::int64_t lda, std::int64_t stride_a,
                 cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
                 std::int64_t stride_b, std::int64_t batch_size);
+
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
            onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha,
            cl::sycl::buffer<float, 1> &a, std::int64_t lda, cl::sycl::buffer<float, 1> &b,
            std::int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, std::int64_t ldc);
+
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
            onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha,
            cl::sycl::buffer<double, 1> &a, std::int64_t lda, cl::sycl::buffer<double, 1> &b,
            std::int64_t ldb, double beta, cl::sycl::buffer<double, 1> &c, std::int64_t ldc);
+
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
            onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
            cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c,
            std::int64_t ldc);
+
 void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
            onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
            cl::sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
            cl::sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
            cl::sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
+
 void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
           std::int64_t m, std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer<half, 1> &a,
           std::int64_t lda, cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
           cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
+
 void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
               cl::sycl::buffer<half, 1> &a, std::int64_t lda, cl::sycl::buffer<half, 1> &b,
@@ -651,6 +747,869 @@ void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpos
               cl::sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
               cl::sycl::buffer<int32_t, 1> &c, std::int64_t ldc, cl::sycl::buffer<int32_t, 1> &co);
 
+// USM (Unified Shared Memory) APIs
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
+                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                     float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
+                     std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
+                     std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
+                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
+                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, double beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsm (USM pointer API): float, double, complex<float>, complex<double>; `a` is const, `b` is not.
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
+                     std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower,
+                     onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemv (USM pointer API): float, double, complex<float>, complex<double>; a/x const, y non-const.
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gbmv (USM pointer API): float, double, complex<float>, complex<double>; adds kl/ku to gemv's args.
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
+                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// ger (USM pointer API): float and double overloads; x and y are const, `a` is non-const.
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                    double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gerc (USM pointer API): complex<float> and complex<double> overloads; same argument list as geru.
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// geru (USM pointer API): complex<float> and complex<double> overloads; same argument list as gerc.
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hbmv (USM pointer API): complex<float> and complex<double> overloads; a/x const, y non-const.
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hemv (USM pointer API): complex<float> and complex<double> overloads; a/x const, y non-const.
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her (USM pointer API): complex data with real alpha; onemkl::uplo arg spelled `uplo` as in hemv.
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// her2 (USM pointer API): complex<float>/complex<double>; onemkl::uplo arg spelled `uplo` as in hemv.
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpmv (USM pointer API): complex<float> and complex<double> overloads; `a` carries no lda argument.
+cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr (USM pointer API): complex data with real alpha; onemkl::uplo arg spelled `uplo` as in hpmv.
+cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// hpr2 (USM pointer API): complex<float>/complex<double>; onemkl::uplo arg spelled `uplo` as in hpmv.
+cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// sbmv (USM pointer API): float and double overloads; a/x const, y non-const.
+cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k,
+                     double alpha, const double *a, std::int64_t lda, const double *x,
+                     std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spmv (USM pointer API): float and double overloads; `a` carries no lda argument.
+cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *a, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *a, const double *x, std::int64_t incx, double beta, double *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr (USM pointer API): float and double overloads; onemkl::uplo arg spelled `uplo` as in spmv.
+cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// spr2 (USM pointer API): float and double overloads; onemkl::uplo arg spelled `uplo` as in spmv.
+cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// symv (USM pointer API): float and double overloads; a/x const, y non-const.
+cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *a, std::int64_t lda, const float *x, std::int64_t incx,
+                     float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr (USM pointer API): float and double overloads; onemkl::uplo arg spelled `uplo` as in symv.
+cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                    const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                    const double *x, std::int64_t incx, double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// syr2 (USM pointer API): float and double overloads; onemkl::uplo arg spelled `uplo` as in symv.
+cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
+                     double *a, std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbmv (USM pointer API): four precisions; `a` const, `x` non-const; uplo arg spelled as in hbmv.
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tbsv (USM pointer API): four precisions; `a` const, `x` non-const; uplo arg spelled as in hbmv.
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, std::int64_t k,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpmv (USM pointer API): four precisions; `a` has no lda argument; uplo arg spelled as in hpmv.
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// tpsv (USM pointer API): four precisions; `a` has no lda argument; uplo arg spelled as in hpmv.
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trmv (USM pointer API): four precisions; `a` const, `x` non-const; uplo arg spelled as in hemv.
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda,
+                     double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// trsv (USM pointer API): four precisions; `a` const, `x` non-const; uplo arg spelled as in hemv.
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda,
+                     double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo uplo, onemkl::transpose trans,
+                     onemkl::diag diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// asum (USM pointer API): complex and real overloads; `result` is real-typed even for complex x.
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy (USM pointer API): float, double, complex<float>, complex<double>; x const, y non-const.
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x,
+                     std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x,
+                     std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// copy (USM pointer API): float, double, complex<float>, complex<double>; x const, y non-const.
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dot (USM pointer API): float, double, and float-input/double-result; sdsdot adds scalar `sb`.
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                    const double *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x,
+                       std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                       const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                    const float *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotc (USM pointer API): complex<float> and complex<double> overloads; same argument list as dotu.
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// dotu (USM pointer API): complex<float> and complex<double> overloads; same argument list as dotc.
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
+                     std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
+                     std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// nrm2 (USM pointer API): complex and real overloads; `result` is real-typed even for complex x.
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                     float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                     double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rot (USM pointer API): complex and real overloads; c and s are real scalars in all four.
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
+                    double s, const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                    std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
+                    std::int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotg (USM pointer API): real and complex overloads; `c` stays real-typed in the complex ones.
+cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
+                     float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotg(cl::sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
+                     double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotm, pointer overloads (float, then double): x, y and param are mutable pointers.
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// rotmg, pointer overloads: d1, d2, x1 and param are pointers, but y1 is passed by value.
+cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
+                      float *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
+                      double *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// scal, pointer overloads: alpha is passed by value and x is a mutable pointer.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// Complex alpha with complex x.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// Mixed overloads: real alpha (float/double) with complex x.
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// swap, pointer overloads: both x and y are mutable pointers (real types, then complex).
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamax, pointer overloads: x is read-only (const); result is a std::int64_t pointer.
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// iamin, pointer overloads: x is read-only (const); result is a std::int64_t pointer.
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
+                      std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch, group form: sizes/scalars are pointers, a/b are const T**, c is T**.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
+                           const float **b, std::int64_t *ldb, float *beta, float **c,
+                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// double overload of the group form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
+                           const double **b, std::int64_t *ldb, double *beta, double **c,
+                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// std::complex<float> overload of the group form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, std::complex<float> *alpha,
+                           const std::complex<float> **a, std::int64_t *lda,
+                           const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// std::complex<double> overload of the group form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa,
+                           onemkl::transpose *transb, std::int64_t *m, std::int64_t *n,
+                           std::int64_t *k, std::complex<double> *alpha,
+                           const std::complex<double> **a, std::int64_t *lda,
+                           const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemm_batch, strided form: by-value sizes/scalars plus stride_a/b/c and batch_size.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+                           float *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// double overload of the strided form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+                           double *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// std::complex<float> overload of the strided form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// std::complex<double> overload of the strided form.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa,
+                           onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// axpy_batch, group form: n/alpha/incx/incy/group_size are pointers; x is const T**, y is T**.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+                           std::int64_t *incx, float **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+                           std::int64_t *incx, double **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// Complex overloads (std::complex<float>, then std::complex<double>).
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+                           const std::complex<float> **x, std::int64_t *incx,
+                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+                           const std::complex<double> **x, std::int64_t *incx,
+                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// gemmt, pointer overloads: a and b are const pointers, c is mutable; alpha/beta by value.
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha,
+                      const float *a, std::int64_t lda, const float *b, std::int64_t ldb,
+                      float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                      const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                      double beta, double *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+// Complex overloads; note std::complex<double> is declared before std::complex<float> here.
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k,
+                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
+cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa,
+                      onemkl::transpose transb, std::int64_t n, std::int64_t k,
+                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
+                      std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies = {});
+
 } //namespace internal
 } //namespace mklgpu
 } //namespace onemkl
diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
index 1c4e54c46..6a68a73a0 100644
--- a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
+++ b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
@@ -33,12 +33,6 @@ typedef enum { MKL_NONUNIT = 131, MKL_UNIT = 132 } MKL_DIAG;
 
 typedef enum { MKL_LEFT = 141, MKL_RIGHT = 142 } MKL_SIDE;
 
-typedef enum {
-    MKL_COMPACT_SSE    = 181,
-    MKL_COMPACT_AVX    = 182,
-    MKL_COMPACT_AVX512 = 183
-} MKL_COMPACT_PACK;
-
 enum CBLAS_OFFSET { CblasRowOffset = 171, CblasColOffset = 172, CblasFixOffset = 173 };
 typedef enum CBLAS_OFFSET CBLAS_OFFSET;
 
@@ -88,7 +82,7 @@ inline CBLAS_OFFSET cblas_convert(onemkl::offset o) {
 
 namespace gpu {
 
-// gemm
+// Buffer APIs
 
 void sgemm(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n,
            int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
@@ -110,8 +104,6 @@ void zgemm(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, i
            int64_t lda, cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// symm
-
 void ssymm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, int64_t m, int64_t n,
            float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &b,
            int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, int64_t ldc);
@@ -131,8 +123,6 @@ void zsymm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, in
            cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
            cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// hemm
-
 void chemm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, int64_t m, int64_t n,
            std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
            cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
@@ -143,7 +133,6 @@ void zhemm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, in
            cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
            cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// syrk
 void ssyrk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k,
            float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda, float beta,
            cl::sycl::buffer<float, 1> &c, int64_t ldc);
@@ -160,8 +149,6 @@ void zsyrk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, in
            std::complex<double> alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
            std::complex<double> beta, cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// herk
-
 void cherk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k,
            float alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda, float beta,
            cl::sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
@@ -170,8 +157,6 @@ void zherk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, in
            double alpha, cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda, double beta,
            cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// syr2k
-
 void ssyr2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k,
             float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &b,
             int64_t ldb, float beta, cl::sycl::buffer<float, 1> &c, int64_t ldc);
@@ -191,8 +176,6 @@ void zsyr2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, i
             cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
             cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// her2k
-
 void cher2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k,
             std::complex<float> alpha, cl::sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
             cl::sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, float beta,
@@ -203,8 +186,6 @@ void zher2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, i
             cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, double beta,
             cl::sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
 
-// trmm
-
 void strmm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
            MKL_DIAG unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
            int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb);
@@ -223,7 +204,6 @@ void ztrmm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MK
            cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
 
-// trsm
 void strsm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
            MKL_DIAG unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer<float, 1> &a,
            int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb);
@@ -242,8 +222,6 @@ void ztrsm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MK
            cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
 
-// Level2
-
 void sgemv(cl::sycl::queue &queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, float alpha,
            cl::sycl::buffer<float, 1> &a, int64_t lda, cl::sycl::buffer<float, 1> &x, int64_t incx,
            float beta, cl::sycl::buffer<float, 1> &y, int64_t incy);
@@ -519,8 +497,6 @@ void ztrsv(cl::sycl::queue &queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, MKL_DIAG
            cl::sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
            cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
 
-// Level1
-
 void scasum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
             int64_t incx, cl::sycl::buffer<float, 1> &result);
 
@@ -680,89 +656,6 @@ void idamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, i
 void icamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
             int64_t incx, cl::sycl::buffer<int64_t, 1> &result);
 
-void dnrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-           cl::sycl::buffer<double, 1> &result);
-
-void csrot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-           int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy, float c,
-           float s);
-
-void zdrot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-           int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy, double c,
-           double s);
-
-void srot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-          cl::sycl::buffer<float, 1> &y, int64_t incy, float c, float s);
-
-void drot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-          cl::sycl::buffer<double, 1> &y, int64_t incy, double c, double s);
-
-void srotg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<float, 1> &b,
-           cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<float, 1> &s);
-
-void drotg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<double, 1> &b,
-           cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<double, 1> &s);
-
-void crotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<float>, 1> &a,
-           cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<float, 1> &c,
-           cl::sycl::buffer<std::complex<float>, 1> &s);
-
-void zrotg(cl::sycl::queue &queue, cl::sycl::buffer<std::complex<double>, 1> &a,
-           cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<double, 1> &c,
-           cl::sycl::buffer<std::complex<double>, 1> &s);
-
-void srotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-           cl::sycl::buffer<float, 1> &y, int64_t incy, cl::sycl::buffer<float, 1> &param);
-
-void drotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-           cl::sycl::buffer<double, 1> &y, int64_t incy, cl::sycl::buffer<double, 1> &param);
-
-void srotmg(cl::sycl::queue &queue, cl::sycl::buffer<float, 1> &d1, cl::sycl::buffer<float, 1> &d2,
-            cl::sycl::buffer<float, 1> &x1, float y1, cl::sycl::buffer<float, 1> &param);
-
-void drotmg(cl::sycl::queue &queue, cl::sycl::buffer<double, 1> &d1,
-            cl::sycl::buffer<double, 1> &d2, cl::sycl::buffer<double, 1> &x1, double y1,
-            cl::sycl::buffer<double, 1> &param);
-
-void sscal(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer<float, 1> &x,
-           int64_t incx);
-
-void dscal(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer<double, 1> &x,
-           int64_t incx);
-
-void cscal(cl::sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void zscal(cl::sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void csscal(cl::sycl::queue &queue, int64_t n, float alpha,
-            cl::sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void zdscal(cl::sycl::queue &queue, int64_t n, double alpha,
-            cl::sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void sswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-           cl::sycl::buffer<float, 1> &y, int64_t incy);
-
-void dswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-           cl::sycl::buffer<double, 1> &y, int64_t incy);
-
-void cswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-           int64_t incx, cl::sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void zswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
-           int64_t incx, cl::sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void isamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<float, 1> &x, int64_t incx,
-            cl::sycl::buffer<int64_t, 1> &result);
-
-void idamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<double, 1> &x, int64_t incx,
-            cl::sycl::buffer<int64_t, 1> &result);
-
-void icamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<float>, 1> &x,
-            int64_t incx, cl::sycl::buffer<int64_t, 1> &result);
-
 void izamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
             int64_t incx, cl::sycl::buffer<int64_t, 1> &result);
 
@@ -778,8 +671,6 @@ void icamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<flo
 void izamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer<std::complex<double>, 1> &x,
             int64_t incx, cl::sycl::buffer<int64_t, 1> &result);
 
-// batch api
-
 void sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m,
                  int64_t n, int64_t k, float alpha, cl::sycl::buffer<float, 1> &a, int64_t lda,
                  int64_t stride_a, cl::sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
@@ -836,8 +727,6 @@ void ztrsm_batch(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_low
                  int64_t ldb, int64_t stride_b, int64_t batch_size, int64_t offset_a = 0,
                  int64_t offset_b = 0);
 
-// BLAS like extension
-
 void sgemmt(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
             MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
             int64_t lda, cl::sycl::buffer<float, 1> &b, int64_t ldb, float beta,
@@ -875,6 +764,869 @@ void gemm_s8u8s32(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE tr
                   cl::sycl::buffer<int8_t, 1> &a, int64_t lda, int8_t ao,
                   cl::sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
                   cl::sycl::buffer<int32_t, 1> &c, int64_t ldc, cl::sycl::buffer<int32_t, 1> &co);
+
+// USM (Unified Shared Memory) APIs: raw-pointer operands, event dependencies in, event out
+
+cl::sycl::event sgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                           int64_t m, int64_t n, int64_t k, float alpha, const float *a,
+                           int64_t lda, const float *b, int64_t ldb, float beta, float *c,
+                           int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event dgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                           int64_t m, int64_t n, int64_t k, double alpha, const double *a,
+                           int64_t lda, const double *b, int64_t ldb, double beta, double *c,
+                           int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event cgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                           int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                           int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                           int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event zgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                           int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
+                           std::complex<double> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event ssymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
+                           const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event dsymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, double alpha, const double *a, int64_t lda,
+                           const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event csymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                           int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                           int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event zsymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
+                           std::complex<double> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event chemm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                           int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                           int64_t ldc, const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event zhemm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           int64_t m, int64_t n, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
+                           std::complex<double> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event ssyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                           float beta, float *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event dsyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                           double beta, double *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event csyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, std::complex<float> beta,
+                           std::complex<float> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event zsyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda, std::complex<double> beta,
+                           std::complex<double> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event cherk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, float alpha, const std::complex<float> *a,
+                           int64_t lda, float beta, std::complex<float> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event zherk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                           int64_t n, int64_t k, double alpha, const std::complex<double> *a,
+                           int64_t lda, double beta, std::complex<double> *c, int64_t ldc,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_c = 0);
+
+cl::sycl::event ssyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
+                            const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event dsyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
+                            const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event csyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, std::complex<float> alpha,
+                            const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                            int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                            int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event zsyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, std::complex<double> alpha,
+                            const std::complex<double> *a, int64_t lda,
+                            const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
+                            std::complex<double> *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event cher2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, std::complex<float> alpha,
+                            const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                            int64_t ldb, float beta, std::complex<float> *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event zher2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans,
+                            int64_t n, int64_t k, std::complex<double> alpha,
+                            const std::complex<double> *a, int64_t lda,
+                            const std::complex<double> *b, int64_t ldb, double beta,
+                            std::complex<double> *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);
+
+cl::sycl::event strmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           float alpha, const float *a, int64_t lda, float *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event dtrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           double alpha, const double *a, int64_t lda, double *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event ctrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                           std::complex<float> *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event ztrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                           std::complex<double> *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event strsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           float alpha, const float *a, int64_t lda, float *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event dtrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           double alpha, const double *a, int64_t lda, double *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event ctrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                           std::complex<float> *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event ztrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower,
+                           MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                           std::complex<double> *b, int64_t ldb,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                           int64_t offset_a = 0, int64_t offset_b = 0);
+
+cl::sycl::event sgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           float alpha, const float *a, int64_t lda, const float *x, int64_t incx,
+                           float beta, float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           double alpha, const double *a, int64_t lda, const double *x,
+                           int64_t incx, double beta, double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                           const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                           std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event sgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           int64_t kl, int64_t ku, float alpha, const float *a, int64_t lda,
+                           const float *x, int64_t incx, float beta, float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           int64_t kl, int64_t ku, double alpha, const double *a, int64_t lda,
+                           const double *x, int64_t incx, double beta, double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           int64_t kl, int64_t ku, std::complex<float> alpha,
+                           const std::complex<float> *a, int64_t lda, const std::complex<float> *x,
+                           int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n,
+                           int64_t kl, int64_t ku, std::complex<double> alpha,
+                           const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event sger_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, float alpha, const float *x,
+                          int64_t incx, const float *y, int64_t incy, float *a, int64_t lda,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dger_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, double alpha,
+                          const double *x, int64_t incx, const double *y, int64_t incy, double *a,
+                          int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cgerc_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex<float> alpha,
+                           const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                           int64_t incy, std::complex<float> *a, int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zgerc_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex<double> alpha,
+                           const std::complex<double> *x, int64_t incx,
+                           const std::complex<double> *y, int64_t incy, std::complex<double> *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cgeru_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex<float> alpha,
+                           const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
+                           int64_t incy, std::complex<float> *a, int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zgeru_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex<double> alpha,
+                           const std::complex<double> *x, int64_t incx,
+                           const std::complex<double> *y, int64_t incy, std::complex<double> *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event chbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                           const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                           std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zhbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event chemv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
+                           const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                           std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zhemv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cher_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                          const std::complex<float> *x, int64_t incx, std::complex<float> *a,
+                          int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zher_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                          const std::complex<double> *x, int64_t incx, std::complex<double> *a,
+                          int64_t lda, const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event cher2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
+                           const std::complex<float> *y, int64_t incy, std::complex<float> *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zher2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
+                           const std::complex<double> *y, int64_t incy, std::complex<double> *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event chpmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           const std::complex<float> *x, int64_t incx, std::complex<float> beta,
+                           std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zhpmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> beta,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event chpr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                          const std::complex<float> *x, int64_t incx, std::complex<float> *a,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zhpr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                          const std::complex<double> *x, int64_t incx, std::complex<double> *a,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event chpr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
+                           const std::complex<float> *y, int64_t incy, std::complex<float> *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zhpr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n,
+                           std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
+                           const std::complex<double> *y, int64_t incy, std::complex<double> *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ssbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k, float alpha,
+                           const float *a, int64_t lda, const float *x, int64_t incx, float beta,
+                           float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dsbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k,
+                           double alpha, const double *a, int64_t lda, const double *x,
+                           int64_t incx, double beta, double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event sspmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                           const float *a, const float *x, int64_t incx, float beta, float *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dspmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                           const double *a, const double *x, int64_t incx, double beta, double *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event sspr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                          const float *x, int64_t incx, float *a,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dspr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                          const double *x, int64_t incx, double *a,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event sspr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                           const float *x, int64_t incx, const float *y, int64_t incy, float *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dspr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                           const double *x, int64_t incx, const double *y, int64_t incy, double *a,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ssymv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                           const float *a, int64_t lda, const float *x, int64_t incx, float beta,
+                           float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dsymv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                           const double *a, int64_t lda, const double *x, int64_t incx, double beta,
+                           double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ssyr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                          const float *x, int64_t incx, float *a, int64_t lda,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dsyr_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                          const double *x, int64_t incx, double *a, int64_t lda,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ssyr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha,
+                           const float *x, int64_t incx, const float *y, int64_t incy, float *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dsyr2_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha,
+                           const double *x, int64_t incx, const double *y, int64_t incy, double *a,
+                           int64_t lda,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event stbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const float *a, int64_t lda,
+                           float *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event dtbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const double *a, int64_t lda,
+                           double *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ctbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const std::complex<float> *a,
+                           int64_t lda, std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event ztbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const std::complex<double> *a,
+                           int64_t lda, std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event stbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const float *a, int64_t lda,
+                           float *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: same parameter list as stbmv_sycl; x is the only non-const pointer
+
+cl::sycl::event dtbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const double *a, int64_t lda,
+                           double *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of stbsv_sycl
+
+cl::sycl::event ctbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const std::complex<float> *a,
+                           int64_t lda, std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of stbsv_sycl
+
+cl::sycl::event ztbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, int64_t k, const std::complex<double> *a,
+                           int64_t lda, std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of stbsv_sycl
+
+cl::sycl::event stpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const float *a, float *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: a has no lda argument here (strmv_sycl has one); x is the only non-const pointer
+
+cl::sycl::event dtpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const double *a, double *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of stpmv_sycl
+
+cl::sycl::event ctpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<float> *a,
+                           std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of stpmv_sycl
+
+cl::sycl::event ztpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<double> *a,
+                           std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of stpmv_sycl
+
+cl::sycl::event stpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const float *a, float *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: same parameter list as stpmv_sycl (no lda); x is the only non-const pointer
+
+cl::sycl::event dtpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const double *a, double *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of stpsv_sycl
+
+cl::sycl::event ctpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<float> *a,
+                           std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of stpsv_sycl
+
+cl::sycl::event ztpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<double> *a,
+                           std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of stpsv_sycl
+
+cl::sycl::event strmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const float *a, int64_t lda, float *x,
+                           int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: a is const and carries lda; x is the only non-const pointer
+
+cl::sycl::event dtrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const double *a, int64_t lda, double *x,
+                           int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of strmv_sycl
+
+cl::sycl::event ctrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<float> *a, int64_t lda,
+                           std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of strmv_sycl
+
+cl::sycl::event ztrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<double> *a, int64_t lda,
+                           std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of strmv_sycl
+
+cl::sycl::event strsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const float *a, int64_t lda, float *x,
+                           int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: same parameter list as strmv_sycl; x is the only non-const pointer
+
+cl::sycl::event dtrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const double *a, int64_t lda, double *x,
+                           int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of strsv_sycl
+
+cl::sycl::event ctrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<float> *a, int64_t lda,
+                           std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of strsv_sycl
+
+cl::sycl::event ztrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans,
+                           MKL_DIAG diag, int64_t n, const std::complex<double> *a, int64_t lda,
+                           std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of strsv_sycl
+
+cl::sycl::event scasum_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                            int64_t incx, float *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> input, real float result written through a pointer
+
+cl::sycl::event dzasum_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                            int64_t incx, double *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> input, real double result
+
+cl::sycl::event sasum_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                           float *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float input, float result
+
+cl::sycl::event dasum_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                           double *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double input, double result
+
+cl::sycl::event saxpy_sycl(cl::sycl::queue *queue, int64_t n, float alpha, const float *x,
+                           int64_t incx, float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: alpha by value, x const, y is the only non-const pointer
+
+cl::sycl::event daxpy_sycl(cl::sycl::queue *queue, int64_t n, double alpha, const double *x,
+                           int64_t incx, double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of saxpy_sycl
+
+cl::sycl::event caxpy_sycl(cl::sycl::queue *queue, int64_t n, std::complex<float> alpha,
+                           const std::complex<float> *x, int64_t incx, std::complex<float> *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart; alpha is complex too
+
+cl::sycl::event zaxpy_sycl(cl::sycl::queue *queue, int64_t n, std::complex<double> alpha,
+                           const std::complex<double> *x, int64_t incx, std::complex<double> *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart; alpha is complex too
+
+cl::sycl::event scopy_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                           float *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: x const, y is the only non-const pointer
+
+cl::sycl::event dcopy_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                           double *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of scopy_sycl
+
+cl::sycl::event ccopy_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                           int64_t incx, std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of scopy_sycl
+
+cl::sycl::event zcopy_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                           int64_t incx, std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of scopy_sycl
+
+cl::sycl::event sdot_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                          const float *y, int64_t incy, float *result,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float inputs (both const), float result written through a pointer
+
+cl::sycl::event ddot_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                          const double *y, int64_t incy, double *result,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double inputs, double result
+
+cl::sycl::event sdsdot_sycl(cl::sycl::queue *queue, int64_t n, float sb, const float *x,
+                            int64_t incx, const float *y, int64_t incy, float *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // like sdot_sycl plus an extra by-value float sb ahead of x
+
+cl::sycl::event dsdot_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                           const float *y, int64_t incy, double *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // mixed precision: float inputs, double result
+
+cl::sycl::event cdotc_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                           int64_t incx, const std::complex<float> *y, int64_t incy,
+                           std::complex<float> *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float>: x, y const; result is complex. NOTE(review): presumably the conjugated dot product - confirm
+
+cl::sycl::event zdotc_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                           int64_t incx, const std::complex<double> *y, int64_t incy,
+                           std::complex<double> *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of cdotc_sycl
+
+cl::sycl::event cdotu_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                           int64_t incx, const std::complex<float> *y, int64_t incy,
+                           std::complex<float> *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // same signature shape as cdotc_sycl. NOTE(review): presumably the unconjugated dot product - confirm
+
+cl::sycl::event zdotu_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                           int64_t incx, const std::complex<double> *y, int64_t incy,
+                           std::complex<double> *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of cdotu_sycl
+
+cl::sycl::event scnrm2_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                            int64_t incx, float *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> input, real float result written through a pointer
+
+cl::sycl::event dznrm2_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                            int64_t incx, double *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> input, real double result
+
+cl::sycl::event snrm2_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                           float *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float input, float result
+
+cl::sycl::event dnrm2_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                           double *result,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double input, double result
+
+cl::sycl::event csrot_sycl(cl::sycl::queue *queue, int64_t n, std::complex<float> *x, int64_t incx,
+                           std::complex<float> *y, int64_t incy, float c, float s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // complex<float> x and y (both non-const) with real float c and s by value
+
+cl::sycl::event zdrot_sycl(cl::sycl::queue *queue, int64_t n, std::complex<double> *x, int64_t incx,
+                           std::complex<double> *y, int64_t incy, double c, double s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // complex<double> x and y with real double c and s by value
+
+cl::sycl::event srot_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y,
+                          int64_t incy, float c, float s,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: x and y both non-const; c and s by value
+
+cl::sycl::event drot_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y,
+                          int64_t incy, double c, double s,
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of srot_sycl
+
+cl::sycl::event srotg_sycl(cl::sycl::queue *queue, float *a, float *b, float *c, float *s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: all four data arguments are non-const pointers; no n/inc arguments
+
+cl::sycl::event drotg_sycl(cl::sycl::queue *queue, double *a, double *b, double *c, double *s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of srotg_sycl
+
+cl::sycl::event crotg_sycl(cl::sycl::queue *queue, std::complex<float> *a, std::complex<float> *b,
+                           float *c, std::complex<float> *s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // a, b, s are complex<float>; c is a real float pointer
+
+cl::sycl::event zrotg_sycl(cl::sycl::queue *queue, std::complex<double> *a, std::complex<double> *b,
+                           double *c, std::complex<double> *s,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // a, b, s are complex<double>; c is a real double pointer
+
+cl::sycl::event srotm_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y,
+                           int64_t incy, float *param,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: x, y and param are all non-const pointers; param length is not visible here
+
+cl::sycl::event drotm_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y,
+                           int64_t incy, double *param,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of srotm_sycl
+
+cl::sycl::event srotmg_sycl(cl::sycl::queue *queue, float *d1, float *d2, float *x1, float y1,
+                            float *param,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: y1 is passed by value; d1, d2, x1 and param are non-const pointers
+
+cl::sycl::event drotmg_sycl(cl::sycl::queue *queue, double *d1, double *d2, double *x1, double y1,
+                            double *param,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of srotmg_sycl
+
+cl::sycl::event sscal_sycl(cl::sycl::queue *queue, int64_t n, float alpha, float *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float alpha by value, float x (non-const)
+
+cl::sycl::event dscal_sycl(cl::sycl::queue *queue, int64_t n, double alpha, double *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double alpha, double x
+
+cl::sycl::event cscal_sycl(cl::sycl::queue *queue, int64_t n, std::complex<float> alpha,
+                           std::complex<float> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // complex<float> alpha, complex<float> x
+
+cl::sycl::event zscal_sycl(cl::sycl::queue *queue, int64_t n, std::complex<double> alpha,
+                           std::complex<double> *x, int64_t incx,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // complex<double> alpha, complex<double> x
+
+cl::sycl::event csscal_sycl(cl::sycl::queue *queue, int64_t n, float alpha, std::complex<float> *x,
+                            int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // real float alpha applied to complex<float> x
+
+cl::sycl::event zdscal_sycl(cl::sycl::queue *queue, int64_t n, double alpha,
+                            std::complex<double> *x, int64_t incx,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // real double alpha applied to complex<double> x
+
+cl::sycl::event sswap_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: x and y are both non-const pointers
+
+cl::sycl::event dswap_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y,
+                           int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of sswap_sycl
+
+cl::sycl::event cswap_sycl(cl::sycl::queue *queue, int64_t n, std::complex<float> *x, int64_t incx,
+                           std::complex<float> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of sswap_sycl
+
+cl::sycl::event zswap_sycl(cl::sycl::queue *queue, int64_t n, std::complex<double> *x, int64_t incx,
+                           std::complex<double> *y, int64_t incy,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of sswap_sycl
+
+cl::sycl::event isamax_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                            int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float input; result is an int64_t written through a pointer. NOTE(review): index base (0 or 1) is not visible here - confirm
+
+cl::sycl::event idamax_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                            int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double input, int64_t result
+
+cl::sycl::event icamax_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                            int64_t incx, int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> input, int64_t result
+
+cl::sycl::event izamax_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                            int64_t incx, int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> input, int64_t result
+
+cl::sycl::event isamin_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx,
+                            int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // same signature shape as isamax_sycl: float input, int64_t result through a pointer
+
+cl::sycl::event idamin_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx,
+                            int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double input, int64_t result
+
+cl::sycl::event icamin_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<float> *x,
+                            int64_t incx, int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> input, int64_t result
+
+cl::sycl::event izamin_sycl(cl::sycl::queue *queue, int64_t n, const std::complex<double> *x,
+                            int64_t incx, int64_t *result,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> input, int64_t result
+
+cl::sycl::event sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, float alpha, const float *a,
+                            int64_t lda, int64_t strideA, const float *b, int64_t ldb,
+                            int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC,
+                            int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // strided overload (float): one base pointer per matrix plus strideA/B/C; queue is taken by reference, not by pointer. NOTE(review): last size argument is named group_size here but batch_size in axpy_batch - confirm meaning
+
+cl::sycl::event dgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, double alpha, const double *a,
+                            int64_t lda, int64_t strideA, const double *b, int64_t ldb,
+                            int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC,
+                            int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // strided overload, double
+
+cl::sycl::event cgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
+                            const std::complex<float> *a, int64_t lda, int64_t strideA,
+                            const std::complex<float> *b, int64_t ldb, int64_t strideB,
+                            std::complex<float> beta, std::complex<float> *c, int64_t ldc,
+                            int64_t strideC, int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // strided overload, std::complex<float>
+
+cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
+                            const std::complex<double> *a, int64_t lda, int64_t strideA,
+                            const std::complex<double> *b, int64_t ldb, int64_t strideB,
+                            std::complex<double> beta, std::complex<double> *c, int64_t ldc,
+                            int64_t strideC, int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // strided overload, std::complex<double>
+
+cl::sycl::event sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, float alpha, const float **a,
+                            int64_t lda, const float **b, int64_t ldb, float beta, float **c,
+                            int64_t ldc, int64_t offset_batch, int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // pointer-array overload (float): a, b, c are pointer-to-pointer; no stride arguments, adds offset_batch. NOTE(review): offset_batch semantics are not visible here - confirm
+
+cl::sycl::event dgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, double alpha, const double **a,
+                            int64_t lda, const double **b, int64_t ldb, double beta, double **c,
+                            int64_t ldc, int64_t offset_batch, int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // pointer-array overload, double
+
+cl::sycl::event cgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
+                            const std::complex<float> **a, int64_t lda,
+                            const std::complex<float> **b, int64_t ldb, std::complex<float> beta,
+                            std::complex<float> **c, int64_t ldc, int64_t offset_batch,
+                            int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // pointer-array overload, std::complex<float>
+
+cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb,
+                            int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
+                            const std::complex<double> **a, int64_t lda,
+                            const std::complex<double> **b, int64_t ldb, std::complex<double> beta,
+                            std::complex<double> **c, int64_t ldc, int64_t offset_batch,
+                            int64_t group_size,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // pointer-array overload, std::complex<double>
+
+cl::sycl::event saxpy_batch(cl::sycl::queue &queue, std::int64_t n, float alpha, const float **x,
+                            std::int64_t incx, float **y, std::int64_t incy,
+                            std::int64_t batch_size, std::int64_t offset,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // float: x and y are pointer-to-pointer; queue by reference; integers spelled std::int64_t here, unlike the unqualified int64_t used by gemm_batch
+
+cl::sycl::event daxpy_batch(cl::sycl::queue &queue, std::int64_t n, double alpha, const double **x,
+                            std::int64_t incx, double **y, std::int64_t incy,
+                            std::int64_t batch_size, std::int64_t offset,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // double counterpart of saxpy_batch
+
+cl::sycl::event caxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
+                            const std::complex<float> **x, std::int64_t incx,
+                            std::complex<float> **y, std::int64_t incy, std::int64_t batch_size,
+                            std::int64_t offset,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<float> counterpart of saxpy_batch
+
+cl::sycl::event zaxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
+                            const std::complex<double> **x, std::int64_t incx,
+                            std::complex<double> **y, std::int64_t incy, std::int64_t batch_size,
+                            std::int64_t offset,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);  // std::complex<double> counterpart of saxpy_batch
+
+cl::sycl::event sgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
+                            MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, const float *a,
+                            int64_t lda, const float *b, int64_t ldb, float beta, float *c,
+                            int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);  // float: a, b const, c non-const; the three offsets follow dependencies and default to 0, so callers may omit them
+
+cl::sycl::event dgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
+                            MKL_TRANSPOSE transb, int64_t n, int64_t k, double alpha,
+                            const double *a, int64_t lda, const double *b, int64_t ldb, double beta,
+                            double *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);  // double counterpart of sgemmt_sycl
+
+cl::sycl::event zgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
+                            MKL_TRANSPOSE transb, int64_t n, int64_t k, std::complex<double> alpha,
+                            const std::complex<double> *a, int64_t lda,
+                            const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
+                            std::complex<double> *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);  // std::complex<double> counterpart of sgemmt_sycl
+
+cl::sycl::event cgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
+                            MKL_TRANSPOSE transb, int64_t n, int64_t k, std::complex<float> alpha,
+                            const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
+                            int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                            int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies,
+                            int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0);  // std::complex<float> counterpart of sgemmt_sycl
+
 } // namespace gpu
 } // namespace mkl
 #endif //_MKL_INTERNAL_BLAS_SYCL_GPU_HPP_
diff --git a/src/blas/blas_loader.cpp b/src/blas/blas_loader.cpp
index e1dadc8be..045f43a27 100644
--- a/src/blas/blas_loader.cpp
+++ b/src/blas/blas_loader.cpp
@@ -24,6 +24,8 @@ namespace onemkl {
 namespace blas {
 namespace detail {
 
+// Buffer APIs
+
 void asum(char *libname, cl::sycl::queue &queue, std::int64_t n,
           cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
           cl::sycl::buffer<float, 1> &result) {
@@ -1029,56 +1031,6 @@ void trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_low
                                         alpha, a, lda, b, ldb);
 }
 
-void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-                cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].sgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-                cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].dgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-                cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                cl::sycl::buffer<std::complex<float>, 1> &beta,
-                cl::sycl::buffer<std::complex<float>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].cgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc, group_count, group_size);
-}
-
-void gemm_batch(
-    char *libname, cl::sycl::queue &queue, cl::sycl::buffer<transpose, 1> &transa,
-    cl::sycl::buffer<transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-    cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-    cl::sycl::buffer<std::complex<double>, 1> &alpha, cl::sycl::buffer<std::complex<double>, 1> &a,
-    cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<std::complex<double>, 1> &b,
-    cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<std::complex<double>, 1> &beta,
-    cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-    std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].zgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc, group_count, group_size);
-}
-
 void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb,
                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
@@ -1125,58 +1077,6 @@ void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, transpo
                                                       stride_c, batch_size);
 }
 
-void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<float, 1> &alpha,
-                cl::sycl::buffer<float, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<float, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].strsm_batch_group_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                    group_count, group_size);
-}
-
-void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<double, 1> &alpha,
-                cl::sycl::buffer<double, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<double, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].dtrsm_batch_group_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                    group_count, group_size);
-}
-
-void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<float>, 1> &alpha,
-                cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-                std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].ctrsm_batch_group_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                    group_count, group_size);
-}
-
-void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer<side, 1> &left_right,
-                cl::sycl::buffer<uplo, 1> &upper_lower, cl::sycl::buffer<transpose, 1> &trans,
-                cl::sycl::buffer<diag, 1> &unit_diag, cl::sycl::buffer<std::int64_t, 1> &m,
-                cl::sycl::buffer<std::int64_t, 1> &n,
-                cl::sycl::buffer<std::complex<double>, 1> &alpha,
-                cl::sycl::buffer<std::complex<double>, 1> &a,
-                cl::sycl::buffer<std::int64_t, 1> &lda,
-                cl::sycl::buffer<std::complex<double>, 1> &b,
-                cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-                cl::sycl::buffer<std::int64_t, 1> &group_size) {
-    function_tables[libname].ztrsm_batch_group_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                    group_count, group_size);
-}
-
 void trsm_batch(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
                 cl::sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
@@ -1315,6 +1215,1321 @@ void gemm_ext(char *libname, cl::sycl::queue &queue, transpose transa, transpose
                                             beta, c, ldc);
 }
 
+// USM (Unified Shared Memory) APIs: pointer-based overloads that return a cl::sycl::event
+
+cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<float> *x, std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].scasum_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // asum (USM): complex<float> x, float result; calls the scasum_usm_sycl table entry
+
+cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<double> *x, std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dzasum_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // asum (USM): complex<double> x, double result; calls the dzasum_usm_sycl table entry
+
+cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sasum_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // asum (USM): float x, float result; calls the sasum_usm_sycl table entry
+
+cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dasum_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // asum (USM): double x, double result; calls the dasum_usm_sycl table entry
+
+cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha,
+                     const float *x, std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (USM): float alpha/x/y; calls the saxpy_usm_sycl table entry
+
+cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha,
+                     const double *x, std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (USM): double alpha/x/y; calls the daxpy_usm_sycl table entry
+
+cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (USM): complex<float> alpha/x/y; calls the caxpy_usm_sycl table entry
+
+cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies);
+}  // axpy (USM): complex<double> alpha/x/y; calls the zaxpy_usm_sycl table entry
+
+cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, float *alpha,
+                           const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].saxpy_batch_group_usm_sycl(
+        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
+}  // axpy_batch (USM, grouped): float data; calls saxpy_batch_group_usm_sycl
+
+cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, double *alpha,
+                           const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].daxpy_batch_group_usm_sycl(
+        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
+}  // axpy_batch (USM, grouped): double data; calls daxpy_batch_group_usm_sycl
+
+cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n,
+                           std::complex<float> *alpha, const std::complex<float> **x,
+                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].caxpy_batch_group_usm_sycl(
+        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
+}  // axpy_batch (USM, grouped): complex<float> data; calls caxpy_batch_group_usm_sycl
+
+cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n,
+                           std::complex<double> *alpha, const std::complex<double> **x,
+                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zaxpy_batch_group_usm_sycl(
+        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
+}  // axpy_batch (USM, grouped): complex<double> data; calls zaxpy_batch_group_usm_sycl
+
+cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                     std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].scopy_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // copy (USM): float x/y; calls the scopy_usm_sycl table entry
+
+cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                     std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // copy (USM): double x/y; calls the dcopy_usm_sycl table entry
+
+cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ccopy_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // copy (USM): complex<float> x/y; calls the ccopy_usm_sycl table entry
+
+cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // copy (USM): complex<double> x/y; calls the zcopy_usm_sycl table entry
+
+cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                    std::int64_t incx, const float *y, std::int64_t incy, float *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies);
+}  // dot (USM): float x/y, float result; calls the sdot_usm_sycl table entry
+
+cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                    std::int64_t incx, const double *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ddot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies);
+}  // dot (USM): double x/y, double result; calls the ddot_usm_sycl table entry
+
+cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                    std::int64_t incx, const float *y, std::int64_t incy, double *result,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsdot_usm_sycl(queue, n, x, incx, y, incy, result,
+                                                   dependencies);
+}  // dot (USM): float x/y with a double result; calls the dsdot_usm_sycl table entry
+
+cl::sycl::event dotc(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+                     std::int64_t incy, std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cdotc_usm_sycl(queue, n, x, incx, y, incy, result,
+                                                   dependencies);
+}  // dotc (USM): complex<float> x/y/result; calls the cdotc_usm_sycl table entry
+
+cl::sycl::event dotc(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zdotc_usm_sycl(queue, n, x, incx, y, incy, result,
+                                                   dependencies);
+}  // dotc (USM): complex<double> x/y/result; calls the zdotc_usm_sycl table entry
+
+cl::sycl::event dotu(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
+                     std::int64_t incy, std::complex<float> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cdotu_usm_sycl(queue, n, x, incx, y, incy, result,
+                                                   dependencies);
+}  // dotu (USM): complex<float> x/y/result; calls the cdotu_usm_sycl table entry
+
+cl::sycl::event dotu(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zdotu_usm_sycl(queue, n, x, incx, y, incy, result,
+                                                   dependencies);
+}  // dotu (USM): complex<double> x/y/result; calls the zdotu_usm_sycl table entry
+
+cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].isamin_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamin (USM): float x, std::int64_t result; calls the isamin_usm_sycl table entry
+
+cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].idamin_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamin (USM): double x, std::int64_t result; calls the idamin_usm_sycl table entry
+
+cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].icamin_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamin (USM): complex<float> x, std::int64_t result; calls the icamin_usm_sycl table entry
+
+cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].izamin_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamin (USM): complex<double> x, std::int64_t result; calls the izamin_usm_sycl table entry
+
+cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].isamax_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamax (USM): float x, std::int64_t result; calls the isamax_usm_sycl table entry
+
+cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                      std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].idamax_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamax (USM): double x, std::int64_t result; calls the idamax_usm_sycl table entry
+
+cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].icamax_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamax (USM): complex<float> x, std::int64_t result; calls the icamax_usm_sycl table entry
+
+cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].izamax_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // iamax (USM): complex<double> x, std::int64_t result; calls the izamax_usm_sycl table entry
+
+cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<float> *x, std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].snrm2_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // NOTE(review): complex<float> x goes to the snrm2 slot, not scnrm2 -- confirm table naming
+
+cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     const std::complex<double> *x, std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dnrm2_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // NOTE(review): complex<double> x goes to the dnrm2 slot, not dznrm2 -- confirm table naming
+
+cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x,
+                     std::int64_t incx, float *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].scnrm2_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // NOTE(review): float x goes to the scnrm2 slot, not snrm2 -- confirm table naming
+
+cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x,
+                     std::int64_t incx, double *result,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dznrm2_usm_sycl(queue, n, x, incx, result, dependencies);
+}  // NOTE(review): double x goes to the dznrm2 slot, not dnrm2 -- confirm table naming
+
+cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].srot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies);
+}  // NOTE(review): complex<float> x/y go to the srot slot, not csrot -- confirm table naming
+
+cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
+                    double s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].drot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies);
+}  // NOTE(review): complex<double> x/y go to the drot slot, not zdrot -- confirm table naming
+
+cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x,
+                    std::int64_t incx, float *y, std::int64_t incy, float c, float s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].csrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies);
+}  // NOTE(review): float x/y go to the csrot slot, not srot -- confirm table naming
+
+cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x,
+                    std::int64_t incx, double *y, std::int64_t incy, double c, double s,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies);
+}  // NOTE(review): double x/y go to the zdrot slot, not drot -- confirm table naming
+
+cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, float *a, float *b, float *c, float *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].srotg_usm_sycl(queue, a, b, c, s, dependencies);
+}  // rotg (USM): float a/b/c/s; calls the srotg_usm_sycl table entry
+
+cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, double *a, double *b, double *c,
+                     double *s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].drotg_usm_sycl(queue, a, b, c, s, dependencies);
+}  // rotg (USM): double a/b/c/s; calls the drotg_usm_sycl table entry
+
+cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, std::complex<float> *a,
+                     std::complex<float> *b, float *c, std::complex<float> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].crotg_usm_sycl(queue, a, b, c, s, dependencies);
+}  // rotg (USM): complex<float> a/b/s with float c; calls the crotg_usm_sycl table entry
+
+cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, std::complex<double> *a,
+                     std::complex<double> *b, double *c, std::complex<double> *s,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zrotg_usm_sycl(queue, a, b, c, s, dependencies);
+}  // rotg (USM): complex<double> a/b/s with double c; calls the zrotg_usm_sycl table entry
+
+cl::sycl::event rotm(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x,
+                     std::int64_t incx, float *y, std::int64_t incy, float *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].srotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies);
+}  // rotm (USM): float x/y/param; calls the srotm_usm_sycl table entry
+
+cl::sycl::event rotm(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x,
+                     std::int64_t incx, double *y, std::int64_t incy, double *param,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].drotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies);
+}  // rotm (USM): double x/y/param; calls the drotm_usm_sycl table entry
+
+cl::sycl::event rotmg(char *libname, cl::sycl::queue &queue, float *d1, float *d2, float *x1,
+                      float y1, float *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].srotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies);
+}  // rotmg (USM): float pointers d1/d2/x1/param, y1 by value; calls the srotmg_usm_sycl entry
+
+cl::sycl::event rotmg(char *libname, cl::sycl::queue &queue, double *d1, double *d2, double *x1,
+                      double y1, double *param,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].drotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies);
+}  // rotmg (USM): double pointers d1/d2/x1/param, y1 by value; calls the drotmg_usm_sycl entry
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // scal (USM): float alpha, float x; calls the sscal_usm_sycl table entry
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // scal (USM): double alpha, double x; calls the dscal_usm_sycl table entry
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // scal (USM): complex<float> alpha and x; calls the cscal_usm_sycl table entry
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n,
+                     std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].csscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // NOTE(review): complex<double> alpha goes to the csscal slot, not zscal -- confirm naming
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // NOTE(review): float alpha with complex<float> x uses the zscal slot, not csscal -- confirm
+
+cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zdscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
+}  // scal (USM): double alpha, complex<double> x; calls the zdscal_usm_sycl table entry
+
+cl::sycl::event sdsdot(char *libname, cl::sycl::queue &queue, std::int64_t n, float sb,
+                       const float *x, std::int64_t incx, const float *y, std::int64_t incy,
+                       float *result, const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy, result,
+                                                    dependencies);
+}  // sdsdot (USM): float sb/x/y/result; calls the sdsdot_usm_sycl table entry
+
+cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x,
+                     std::int64_t incx, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sswap_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // swap (USM): float x/y; calls the sswap_usm_sycl table entry
+
+cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x,
+                     std::int64_t incx, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dswap_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // swap (USM): double x/y; calls the dswap_usm_sycl table entry
+
+cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cswap_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // swap (USM): complex<float> x/y; calls the cswap_usm_sycl table entry
+
+cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zswap_usm_sycl(queue, n, x, incx, y, incy, dependencies);
+}  // swap (USM): complex<double> x/y; calls the zswap_usm_sycl table entry
+
+cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
+                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                                                   incx, beta, y, incy, dependencies);
+}  // gbmv (USM): float alpha/a/x/beta/y; calls the sgbmv_usm_sycl table entry
+
+cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
+                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
+                     double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                                                   incx, beta, y, incy, dependencies);
+}  // gbmv (USM): double alpha/a/x/beta/y; calls the dgbmv_usm_sycl table entry
+
+cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
+                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                                                   incx, beta, y, incy, dependencies);
+}  // gbmv (USM): complex<float> alpha/a/x/beta/y; calls the cgbmv_usm_sycl table entry
+
+cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
+                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
+                     std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
+                                                   incx, beta, y, incy, dependencies);
+}  // gbmv (USM): complex<double> alpha/a/x/beta/y; calls the zgbmv_usm_sycl table entry
+
+cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
+                                                   y, incy, dependencies);
+}  // gemv (USM): float alpha/a/x/beta/y; calls the sgemv_usm_sycl table entry
+
+cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
+                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
+                                                   y, incy, dependencies);
+}  // gemv (USM): double alpha/a/x/beta/y; calls the dgemv_usm_sycl table entry
+
+cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
+                                                   y, incy, dependencies);
+}  // gemv (USM): complex<float> alpha/a/x/beta/y; calls the cgemv_usm_sycl table entry
+
+cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m,
+                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
+                                                   y, incy, dependencies);
+}  // gemv (USM): complex<double> alpha/a/x/beta/y; calls the zgemv_usm_sycl table entry
+
+cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                    float alpha, const float *x, std::int64_t incx, const float *y,
+                    std::int64_t incy, float *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda,
+                                                  dependencies);
+}  // ger (USM): float alpha/x/y/a; calls the sger_usm_sycl table entry
+
+cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                    double alpha, const double *x, std::int64_t incx, const double *y,
+                    std::int64_t incy, double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dger_usm_sycl( // libname picks the table; rest forwarded
+        queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event gerc(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgerc_usm_sycl( // libname picks the table; rest forwarded
+        queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event gerc(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgerc_usm_sycl( // libname picks the table; rest forwarded
+        queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event geru(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgeru_usm_sycl( // libname picks the table; rest forwarded
+        queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event geru(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgeru_usm_sycl( // libname picks the table; rest forwarded
+        queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event hbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
+                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event hbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
+                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event hemv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chemv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event hemv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhemv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    float alpha, const std::complex<float> *x, std::int64_t incx,
+                    std::complex<float> *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cher_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    double alpha, const std::complex<double> *x, std::int64_t incx,
+                    std::complex<double> *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zher_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+cl::sycl::event her2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cher2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event her2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zher2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event hpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a,
+                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
+                     std::complex<float> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event hpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a,
+                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
+                     std::complex<double> *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    float alpha, const std::complex<float> *x, std::int64_t incx,
+                    std::complex<float> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chpr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    double alpha, const std::complex<double> *x, std::int64_t incx,
+                    std::complex<double> *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhpr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+cl::sycl::event hpr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
+                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chpr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+}
+
+cl::sycl::event hpr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
+                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhpr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+}
+
+cl::sycl::event sbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event sbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     std::int64_t k, double alpha, const double *a, std::int64_t lda,
+                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event spmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     float alpha, const float *a, const float *x, std::int64_t incx, float beta,
+                     float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sspmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event spmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     double alpha, const double *a, const double *x, std::int64_t incx, double beta,
+                     double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dspmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    float alpha, const float *x, std::int64_t incx, float *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sspr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    double alpha, const double *x, std::int64_t incx, double *a,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dspr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, dependencies);
+}
+
+cl::sycl::event spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     float alpha, const float *x, std::int64_t incx, const float *y,
+                     std::int64_t incy, float *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sspr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+}
+
+cl::sycl::event spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     double alpha, const double *x, std::int64_t incx, const double *y,
+                     std::int64_t incy, double *a,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dspr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies);
+}
+
+cl::sycl::event symv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     float alpha, const float *a, std::int64_t lda, const float *x,
+                     std::int64_t incx, float beta, float *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssymv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event symv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     double alpha, const double *a, std::int64_t lda, const double *x,
+                     std::int64_t incx, double beta, double *y, std::int64_t incy,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsymv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
+}
+
+cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssyr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                    double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
+                    const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsyr_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
+}
+
+cl::sycl::event syr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     float alpha, const float *x, std::int64_t incx, const float *y,
+                     std::int64_t incy, float *a, std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssyr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event syr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n,
+                     double alpha, const double *x, std::int64_t incx, const double *y,
+                     std::int64_t incy, double *a, std::int64_t lda,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsyr2_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
+}
+
+cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].stbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztbmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
+                     std::int64_t lda, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].stbsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
+                     std::int64_t lda, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtbsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
+                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctbsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztbsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].stpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztpmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].stpsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtpsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctpsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                     std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztpsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies);
+}
+
+cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].strmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtrmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctrmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztrmv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].strsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
+                     std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtrsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctrsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
+                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztrsv_usm_sycl( // libname picks the table; rest forwarded
+        queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
+}
+
+cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
+                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgemm_usm_sycl( // libname picks the table; rest forwarded
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgemm_usm_sycl( // libname picks the table; rest forwarded
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgemm_usm_sycl( // libname picks the table; rest forwarded
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb,
+                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgemm_usm_sycl( // libname picks the table; rest forwarded
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
+}
+
+cl::sycl::event hemm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].chemm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // hemm for std::complex<float>: calls table entry chemm_usm_sycl
+
+cl::sycl::event hemm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zhemm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // hemm for std::complex<double>: calls table entry zhemm_usm_sycl
+
+cl::sycl::event herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
+                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // herk for std::complex<float> (float alpha/beta): calls table entry cherk_usm_sycl
+
+cl::sycl::event herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
+                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // herk for std::complex<double> (double alpha/beta): calls table entry zherk_usm_sycl
+
+cl::sycl::event her2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cher2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // her2k for std::complex<float> (float beta): calls table entry cher2k_usm_sycl
+
+cl::sycl::event her2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, double beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zher2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // her2k for std::complex<double> (double beta): calls table entry zher2k_usm_sycl
+
+cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // symm for float: calls table entry ssymm_usm_sycl
+
+cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, double alpha, const double *a,
+                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // symm for double: calls table entry dsymm_usm_sycl
+
+cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].csymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // symm for std::complex<float>: calls table entry csymm_usm_sycl
+
+cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
+                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
+                     std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zsymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
+                                                   lda, b, ldb, beta, c, ldc, dependencies);
+}  // symm for std::complex<double>: calls table entry zsymm_usm_sycl
+
+cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                     float beta, float *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // syrk for float: calls table entry ssyrk_usm_sycl
+
+cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, double alpha, const double *a,
+                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // syrk for double: calls table entry dsyrk_usm_sycl
+
+cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
+                     std::complex<float> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].csyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // syrk for std::complex<float>: calls table entry csyrk_usm_sycl
+
+cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
+                     std::complex<double> *c, std::int64_t ldc,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                   beta, c, ldc, dependencies);
+}  // syrk for std::complex<double>: calls table entry zsyrk_usm_sycl
+
+cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ssyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // syr2k for float: calls table entry ssyr2k_usm_sycl
+
+cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dsyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // syr2k for double: calls table entry dsyr2k_usm_sycl
+
+cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].csyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // syr2k for std::complex<float>: calls table entry csyr2k_usm_sycl
+
+cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zsyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
+                                                    b, ldb, beta, c, ldc, dependencies);
+}  // syr2k for std::complex<double>: calls table entry zsyr2k_usm_sycl
+
+cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].strmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trmm for float: calls table entry strmm_usm_sycl
+
+cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trmm for double: calls table entry dtrmm_usm_sycl
+
+cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trmm for std::complex<float>: calls table entry ctrmm_usm_sycl
+
+cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trmm for std::complex<double>: calls table entry ztrmm_usm_sycl
+
+cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
+                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].strsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trsm for float: calls table entry strsm_usm_sycl
+
+cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
+                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dtrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trsm for double: calls table entry dtrsm_usm_sycl
+
+cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
+                     std::complex<float> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ctrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trsm for std::complex<float>: calls table entry ctrsm_usm_sycl
+
+cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
+                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
+                     std::complex<double> *b, std::int64_t ldb,
+                     const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].ztrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag,
+                                                   m, n, alpha, a, lda, b, ldb, dependencies);
+}  // trsm for std::complex<double>: calls table entry ztrsm_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa,
+                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           float *alpha, const float **a, std::int64_t *lda, const float **b,
+                           std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgemm_batch_group_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
+        group_size, dependencies);
+}  // gemm_batch (group) for float: calls table entry sgemm_batch_group_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa,
+                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           double *alpha, const double **a, std::int64_t *lda, const double **b,
+                           std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgemm_batch_group_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
+        group_size, dependencies);
+}  // gemm_batch (group) for double: calls table entry dgemm_batch_group_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa,
+                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<float> *alpha, const std::complex<float> **a,
+                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgemm_batch_group_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
+        group_size, dependencies);
+}  // gemm_batch (group) for std::complex<float>: calls table entry cgemm_batch_group_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa,
+                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<double> *alpha, const std::complex<double> **a,
+                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgemm_batch_group_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
+        group_size, dependencies);
+}  // gemm_batch (group) for std::complex<double>: calls table entry zgemm_batch_group_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa,
+                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
+                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+                           float *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgemm_batch_strided_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+        stride_c, batch_size, dependencies);
+}  // gemm_batch (strided) for float: calls table entry sgemm_batch_strided_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa,
+                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
+                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
+                           double *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgemm_batch_strided_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+        stride_c, batch_size, dependencies);
+}  // gemm_batch (strided) for double: calls table entry dgemm_batch_strided_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa,
+                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<float> alpha, const std::complex<float> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
+                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgemm_batch_strided_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+        stride_c, batch_size, dependencies);
+}  // gemm_batch (strided) for std::complex<float>: calls table entry cgemm_batch_strided_usm_sycl
+
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa,
+                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgemm_batch_strided_usm_sycl(
+        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+        stride_c, batch_size, dependencies);
+}  // gemm_batch (strided), std::complex<double>: calls table entry zgemm_batch_strided_usm_sycl
+
+cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                      transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a,
+                      std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].sgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha,
+                                                    a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // gemmt for float: calls table entry sgemmt_usm_sycl
+
+cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                      transpose transb, std::int64_t n, std::int64_t k, double alpha,
+                      const double *a, std::int64_t lda, const double *b, std::int64_t ldb,
+                      double beta, double *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].dgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha,
+                                                    a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // gemmt for double: calls table entry dgemmt_usm_sycl
+
+cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                      transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
+                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
+                      std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].cgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha,
+                                                    a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // gemmt for std::complex<float>: calls table entry cgemmt_usm_sycl
+
+cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa,
+                      transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                      const std::complex<double> *a, std::int64_t lda,
+                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
+                      std::complex<double> *c, std::int64_t ldc,
+                      const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return function_tables[libname].zgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha,
+                                                    a, lda, b, ldb, beta, c, ldc, dependencies);
+}  // gemmt for std::complex<double>: calls table entry zgemmt_usm_sycl
+
 } /*namespace detail */
 } /* namespace blas */
 } /* namespace onemkl */
diff --git a/src/blas/function_table.hpp b/src/blas/function_table.hpp
index 69d9128b7..b6fa9ad66 100644
--- a/src/blas/function_table.hpp
+++ b/src/blas/function_table.hpp
@@ -27,6 +27,9 @@
 
 typedef struct {
     int version;
+
+    // Buffer APIs
+
     void (*scasum_sycl)(cl::sycl::queue &queue, std::int64_t n,
                         cl::sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
                         cl::sycl::buffer<float, 1> &result);
@@ -601,44 +604,6 @@ typedef struct {
                        std::int64_t n, std::complex<double> alpha,
                        cl::sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
                        cl::sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*sgemm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-        cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-        cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-        cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-        cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-        cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<float, 1> &beta,
-        cl::sycl::buffer<float, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-        std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*dgemm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-        cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-        cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-        cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-        cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-        cl::sycl::buffer<std::int64_t, 1> &ldb, cl::sycl::buffer<double, 1> &beta,
-        cl::sycl::buffer<double, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-        std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*cgemm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-        cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-        cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-        cl::sycl::buffer<std::complex<float>, 1> &alpha,
-        cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-        cl::sycl::buffer<std::complex<float>, 1> &beta, cl::sycl::buffer<std::complex<float>, 1> &c,
-        cl::sycl::buffer<std::int64_t, 1> &ldc, std::int64_t group_count,
-        cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*zgemm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::transpose, 1> &transa,
-        cl::sycl::buffer<onemkl::transpose, 1> &transb, cl::sycl::buffer<std::int64_t, 1> &m,
-        cl::sycl::buffer<std::int64_t, 1> &n, cl::sycl::buffer<std::int64_t, 1> &k,
-        cl::sycl::buffer<std::complex<double>, 1> &alpha,
-        cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-        cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-        cl::sycl::buffer<std::complex<double>, 1> &beta,
-        cl::sycl::buffer<std::complex<double>, 1> &c, cl::sycl::buffer<std::int64_t, 1> &ldc,
-        std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
     void (*sgemm_batch_strided_sycl)(cl::sycl::queue &queue, onemkl::transpose transa,
                                      onemkl::transpose transb, std::int64_t m, std::int64_t n,
                                      std::int64_t k, float alpha, cl::sycl::buffer<float, 1> &a,
@@ -673,42 +638,6 @@ typedef struct {
                                      std::int64_t stride_b, std::complex<double> beta,
                                      cl::sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
                                      std::int64_t stride_c, std::int64_t batch_size);
-    void (*strsm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-        cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-        cl::sycl::buffer<onemkl::transpose, 1> &trans, cl::sycl::buffer<onemkl::diag, 1> &unit_diag,
-        cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-        cl::sycl::buffer<float, 1> &alpha, cl::sycl::buffer<float, 1> &a,
-        cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<float, 1> &b,
-        cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-        cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*dtrsm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-        cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-        cl::sycl::buffer<onemkl::transpose, 1> &trans, cl::sycl::buffer<onemkl::diag, 1> &unit_diag,
-        cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-        cl::sycl::buffer<double, 1> &alpha, cl::sycl::buffer<double, 1> &a,
-        cl::sycl::buffer<std::int64_t, 1> &lda, cl::sycl::buffer<double, 1> &b,
-        cl::sycl::buffer<std::int64_t, 1> &ldb, std::int64_t group_count,
-        cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*ctrsm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-        cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-        cl::sycl::buffer<onemkl::transpose, 1> &trans, cl::sycl::buffer<onemkl::diag, 1> &unit_diag,
-        cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-        cl::sycl::buffer<std::complex<float>, 1> &alpha,
-        cl::sycl::buffer<std::complex<float>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-        cl::sycl::buffer<std::complex<float>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-        std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
-    void (*ztrsm_batch_group_sycl)(
-        cl::sycl::queue &queue, cl::sycl::buffer<onemkl::side, 1> &left_right,
-        cl::sycl::buffer<onemkl::uplo, 1> &upper_lower,
-        cl::sycl::buffer<onemkl::transpose, 1> &trans, cl::sycl::buffer<onemkl::diag, 1> &unit_diag,
-        cl::sycl::buffer<std::int64_t, 1> &m, cl::sycl::buffer<std::int64_t, 1> &n,
-        cl::sycl::buffer<std::complex<double>, 1> &alpha,
-        cl::sycl::buffer<std::complex<double>, 1> &a, cl::sycl::buffer<std::int64_t, 1> &lda,
-        cl::sycl::buffer<std::complex<double>, 1> &b, cl::sycl::buffer<std::int64_t, 1> &ldb,
-        std::int64_t group_count, cl::sycl::buffer<std::int64_t, 1> &group_size);
     void (*strsm_batch_strided_sycl)(cl::sycl::queue &queue, onemkl::side left_right,
                                      onemkl::uplo upper_lower, onemkl::transpose trans,
                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
@@ -801,6 +730,839 @@ typedef struct {
                            half alpha, cl::sycl::buffer<half, 1> &a, std::int64_t lda,
                            cl::sycl::buffer<half, 1> &b, std::int64_t ldb, half beta,
                            cl::sycl::buffer<half, 1> &c, std::int64_t ldc);
+
+    // USM (Unified Shared Memory) APIs: pointer-based entry points returning a cl::sycl::event
+
+    cl::sycl::event (*scasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<float> *x, std::int64_t incx,
+                                       float *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dzasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<double> *x, std::int64_t incx,
+                                       double *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                      std::int64_t incx, float *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                      std::int64_t incx, double *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*saxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                                      const float *x, std::int64_t incx, float *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*daxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                                      const double *x, std::int64_t incx, double *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*caxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *x,
+                                      std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zaxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *x,
+                                      std::int64_t incx, std::complex<double> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+    cl::sycl::event (*saxpy_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
+        float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+    cl::sycl::event (*daxpy_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
+        std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count,
+        std::int64_t *group_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+    cl::sycl::event (*caxpy_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
+        const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
+        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+    cl::sycl::event (*zaxpy_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
+        const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
+        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+    cl::sycl::event (*scopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                      std::int64_t incx, float *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dcopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                      std::int64_t incx, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ccopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zcopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                     std::int64_t incx, const float *y, std::int64_t incy,
+                                     float *result,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ddot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                     std::int64_t incx, const double *y, std::int64_t incy,
+                                     double *result,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                      std::int64_t incx, const float *y, std::int64_t incy,
+                                      double *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cdotc_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      const std::complex<float> *y, std::int64_t incy,
+                                      std::complex<float> *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zdotc_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      const std::complex<double> *y, std::int64_t incy,
+                                      std::complex<double> *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cdotu_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      const std::complex<float> *y, std::int64_t incy,
+                                      std::complex<float> *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zdotu_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      const std::complex<double> *y, std::int64_t incy,
+                                      std::complex<double> *result,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*isamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                       std::int64_t incx, std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*idamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                       std::int64_t incx, std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*icamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<float> *x, std::int64_t incx,
+                                       std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*izamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<double> *x, std::int64_t incx,
+                                       std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*isamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                       std::int64_t incx, std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*idamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                       std::int64_t incx, std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*icamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<float> *x, std::int64_t incx,
+                                       std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*izamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       const std::complex<double> *x, std::int64_t incx,
+                                       std::int64_t *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*snrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      float *result,  // NOTE(review): complex x, real result; reference BLAS calls this SCNRM2 -- confirm slot naming against loader/backends
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dnrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      double *result,  // NOTE(review): complex<double> x; reference BLAS calls this DZNRM2 -- confirm
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*scnrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x,
+                                       std::int64_t incx, float *result,  // NOTE(review): real float x despite the "sc" prefix; looks swapped with snrm2_usm_sycl -- confirm
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dznrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x,
+                                       std::int64_t incx, double *result,  // NOTE(review): real double x despite the "dz" prefix; looks swapped with dnrm2_usm_sycl -- confirm
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*srot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x,
+                                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
+                                     float c, float s,  // NOTE(review): complex<float> x/y with real c,s; reference BLAS calls this CSROT -- confirm slot naming
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*drot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                     std::complex<double> *x, std::int64_t incx,
+                                     std::complex<double> *y, std::int64_t incy, double c, double s,  // NOTE(review): complex<double> x/y; reference BLAS calls this ZDROT -- confirm
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*csrot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                      std::int64_t incx, float *y, std::int64_t incy, float c,
+                                      float s,  // NOTE(review): real float x/y despite the "cs" prefix; looks swapped with srot_usm_sycl -- confirm
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zdrot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                      std::int64_t incx, double *y, std::int64_t incy, double c,
+                                      double s,  // NOTE(review): real double x/y despite the "zd" prefix; looks swapped with drot_usm_sycl -- confirm
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*srotg_usm_sycl)(cl::sycl::queue &queue, float *a, float *b, float *c,
+                                      float *s,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*drotg_usm_sycl)(cl::sycl::queue &queue, double *a, double *b, double *c,
+                                      double *s,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*crotg_usm_sycl)(cl::sycl::queue &queue, std::complex<float> *a,
+                                      std::complex<float> *b, float *c, std::complex<float> *s,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zrotg_usm_sycl)(cl::sycl::queue &queue, std::complex<double> *a,
+                                      std::complex<double> *b, double *c, std::complex<double> *s,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*srotm_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                      std::int64_t incx, float *y, std::int64_t incy, float *param,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*drotm_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                      std::int64_t incx, double *y, std::int64_t incy,
+                                      double *param,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*srotmg_usm_sycl)(cl::sycl::queue &queue, float *d1, float *d2, float *x1,
+                                       float y1, float *param,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*drotmg_usm_sycl)(cl::sycl::queue &queue, double *d1, double *d2, double *x1,
+                                       double y1, double *param,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x,
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                                      double *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      std::complex<float> alpha, std::complex<float> *x,
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*csscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                       std::complex<double> alpha, std::complex<double> *x,
+                                       std::int64_t incx,  // NOTE(review): complex<double> alpha and x; reference BLAS calls this ZSCAL -- looks swapped with zscal_usm_sycl, confirm
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha,
+                                      std::complex<float> *x, std::int64_t incx,  // NOTE(review): float alpha with complex<float> x; reference BLAS calls this CSSCAL -- confirm
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zdscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha,
+                                       std::complex<double> *x, std::int64_t incx,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sdsdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float sb,
+                                       const float *x, std::int64_t incx, const float *y,
+                                       std::int64_t incy, float *result,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x,
+                                      std::int64_t incx, float *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x,
+                                      std::int64_t incx, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n,
+                                      std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::int64_t kl,
+                                      std::int64_t ku, float alpha, const float *a,
+                                      std::int64_t lda, const float *x, std::int64_t incx,
+                                      float beta, float *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::int64_t kl,
+                                      std::int64_t ku, double alpha, const double *a,
+                                      std::int64_t lda, const double *x, std::int64_t incx,
+                                      double beta, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::int64_t kl,
+                                      std::int64_t ku, std::complex<float> alpha,
+                                      const std::complex<float> *a, std::int64_t lda,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> beta, std::complex<float> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::int64_t kl,
+                                      std::int64_t ku, std::complex<double> alpha,
+                                      const std::complex<double> *a, std::int64_t lda,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> beta, std::complex<double> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, float alpha, const float *a,
+                                      std::int64_t lda, const float *x, std::int64_t incx,
+                                      float beta, float *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, double alpha, const double *a,
+                                      std::int64_t lda, const double *x, std::int64_t incx,
+                                      double beta, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::complex<float> alpha,
+                                      const std::complex<float> *a, std::int64_t lda,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> beta, std::complex<float> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans,
+                                      std::int64_t m, std::int64_t n, std::complex<double> alpha,
+                                      const std::complex<double> *a, std::int64_t lda,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> beta, std::complex<double> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sger_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                     float alpha, const float *x, std::int64_t incx, const float *y,
+                                     std::int64_t incy, float *a, std::int64_t lda,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dger_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                     double alpha, const double *x, std::int64_t incx,
+                                     const double *y, std::int64_t incy, double *a,
+                                     std::int64_t lda,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgerc_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *x,
+                                      std::int64_t incx, const std::complex<float> *y,
+                                      std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgerc_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *x,
+                                      std::int64_t incx, const std::complex<double> *y,
+                                      std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgeru_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *x,
+                                      std::int64_t incx, const std::complex<float> *y,
+                                      std::int64_t incy, std::complex<float> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgeru_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *x,
+                                      std::int64_t incx, const std::complex<double> *y,
+                                      std::int64_t incy, std::complex<double> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                                      const std::complex<float> *a, std::int64_t lda,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> beta, std::complex<float> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                                      const std::complex<double> *a, std::int64_t lda,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> beta, std::complex<double> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chemv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<float> alpha,
+                                      const std::complex<float> *a, std::int64_t lda,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      std::complex<float> beta, std::complex<float> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhemv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<double> alpha,
+                                      const std::complex<double> *a, std::int64_t lda,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      std::complex<double> beta, std::complex<double> *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cher_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                     std::int64_t n, float alpha, const std::complex<float> *x,
+                                     std::int64_t incx, std::complex<float> *a, std::int64_t lda,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zher_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                     std::int64_t n, double alpha, const std::complex<double> *x,
+                                     std::int64_t incx, std::complex<double> *a, std::int64_t lda,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cher2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<float> alpha,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      const std::complex<float> *y, std::int64_t incy,
+                                      std::complex<float> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zher2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<double> alpha,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      const std::complex<double> *y, std::int64_t incy,
+                                      std::complex<double> *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<float> alpha,
+                                      const std::complex<float> *a, const std::complex<float> *x,
+                                      std::int64_t incx, std::complex<float> beta,
+                                      std::complex<float> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                      std::int64_t n, std::complex<double> alpha,
+                                      const std::complex<double> *a, const std::complex<double> *x,
+                                      std::int64_t incx, std::complex<double> beta,
+                                      std::complex<double> *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chpr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // hpr (USM pointers): complex<float>
+                                     std::int64_t n, float alpha, const std::complex<float> *x,  // alpha is real (float)
+                                     std::int64_t incx, std::complex<float> *a,  // a: non-const, no lda
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhpr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // hpr (USM pointers): complex<double>
+                                     std::int64_t n, double alpha, const std::complex<double> *x,  // alpha is real (double)
+                                     std::int64_t incx, std::complex<double> *a,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chpr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // hpr2 (USM pointers): complex<float>
+                                      std::int64_t n, std::complex<float> alpha,
+                                      const std::complex<float> *x, std::int64_t incx,
+                                      const std::complex<float> *y, std::int64_t incy,
+                                      std::complex<float> *a,  // a: non-const, no lda
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhpr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // hpr2 (USM pointers): complex<double>
+                                      std::int64_t n, std::complex<double> alpha,
+                                      const std::complex<double> *x, std::int64_t incx,
+                                      const std::complex<double> *y, std::int64_t incy,
+                                      std::complex<double> *a,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // sbmv (USM pointers): float
+                                      std::int64_t n, std::int64_t k, float alpha, const float *a,  // takes both n and k
+                                      std::int64_t lda, const float *x, std::int64_t incx,
+                                      float beta, float *y, std::int64_t incy,  // y is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // sbmv (USM pointers): double
+                                      std::int64_t n, std::int64_t k, double alpha, const double *a,
+                                      std::int64_t lda, const double *x, std::int64_t incx,
+                                      double beta, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sspmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spmv (USM pointers): float
+                                      std::int64_t n, float alpha, const float *a, const float *x,  // a is passed without an lda
+                                      std::int64_t incx, float beta, float *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dspmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spmv (USM pointers): double
+                                      std::int64_t n, double alpha, const double *a,
+                                      const double *x, std::int64_t incx, double beta, double *y,
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sspr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spr (USM pointers): float
+                                     std::int64_t n, float alpha, const float *x, std::int64_t incx,
+                                     float *a,  // a: non-const, no lda
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dspr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spr (USM pointers): double
+                                     std::int64_t n, double alpha, const double *x,
+                                     std::int64_t incx, double *a,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sspr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spr2 (USM pointers): float
+                                      std::int64_t n, float alpha, const float *x,
+                                      std::int64_t incx, const float *y, std::int64_t incy,
+                                      float *a,  // a: non-const, no lda
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dspr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // spr2 (USM pointers): double
+                                      std::int64_t n, double alpha, const double *x,
+                                      std::int64_t incx, const double *y, std::int64_t incy,
+                                      double *a,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssymv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // symv (USM pointers): float
+                                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
+                                      const float *x, std::int64_t incx, float beta, float *y,  // y is the only non-const pointer
+                                      std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsymv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // symv (USM pointers): double
+                                      std::int64_t n, double alpha, const double *a,
+                                      std::int64_t lda, const double *x, std::int64_t incx,
+                                      double beta, double *y, std::int64_t incy,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssyr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr (USM pointers): float
+                                     std::int64_t n, float alpha, const float *x, std::int64_t incx,
+                                     float *a, std::int64_t lda,  // a is the only non-const pointer
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsyr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr (USM pointers): double
+                                     std::int64_t n, double alpha, const double *x,
+                                     std::int64_t incx, double *a, std::int64_t lda,
+                                     const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssyr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2 (USM pointers): float
+                                      std::int64_t n, float alpha, const float *x,
+                                      std::int64_t incx, const float *y, std::int64_t incy,
+                                      float *a, std::int64_t lda,  // a is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsyr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2 (USM pointers): double
+                                      std::int64_t n, double alpha, const double *x,
+                                      std::int64_t incx, const double *y, std::int64_t incy,
+                                      double *a, std::int64_t lda,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*stbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbmv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const float *a,  // takes both n and k
+                                      std::int64_t lda, float *x, std::int64_t incx,  // x is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbmv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const double *a,
+                                      std::int64_t lda, double *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbmv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbmv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*stbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbsv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const float *a,  // takes both n and k
+                                      std::int64_t lda, float *x, std::int64_t incx,  // x is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbsv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const double *a,
+                                      std::int64_t lda, double *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbsv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tbsv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, std::int64_t k, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*stpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpmv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const float *a, float *x, std::int64_t incx,  // a has no lda; x is non-const
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpmv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const double *a, double *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpmv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<float> *a,
+                                      std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpmv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<double> *a,
+                                      std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*stpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpsv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const float *a, float *x, std::int64_t incx,  // a has no lda; x is non-const
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpsv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const double *a, double *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpsv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<float> *a,
+                                      std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // tpsv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<double> *a,
+                                      std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*strmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trmv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const float *a, std::int64_t lda, float *x,  // x is the only non-const pointer
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trmv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const double *a, std::int64_t lda, double *x,
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trmv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trmv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*strsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trsv (USM pointers): float
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const float *a, std::int64_t lda, float *x,  // x is the only non-const pointer
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trsv (USM pointers): double
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const double *a, std::int64_t lda, double *x,
+                                      std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trsv (USM pointers): complex<float>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // trsv (USM pointers): complex<double>
+                                      onemkl::transpose trans, onemkl::diag unit_diag,
+                                      std::int64_t n, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *x, std::int64_t incx,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa,  // gemm (USM pointers): float
+                                      onemkl::transpose transb, std::int64_t m, std::int64_t n,
+                                      std::int64_t k, float alpha, const float *a, std::int64_t lda,
+                                      const float *b, std::int64_t ldb, float beta, float *c,  // c is the only non-const pointer
+                                      std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa,  // gemm (USM pointers): double
+                                      onemkl::transpose transb, std::int64_t m, std::int64_t n,
+                                      std::int64_t k, double alpha, const double *a,
+                                      std::int64_t lda, const double *b, std::int64_t ldb,
+                                      double beta, double *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa,  // gemm (USM pointers): complex<float>
+                                      onemkl::transpose transb, std::int64_t m, std::int64_t n,
+                                      std::int64_t k, std::complex<float> alpha,
+                                      const std::complex<float> *a, std::int64_t lda,
+                                      const std::complex<float> *b, std::int64_t ldb,
+                                      std::complex<float> beta, std::complex<float> *c,
+                                      std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa,  // gemm (USM pointers): complex<double>
+                                      onemkl::transpose transb, std::int64_t m, std::int64_t n,
+                                      std::int64_t k, std::complex<double> alpha,
+                                      const std::complex<double> *a, std::int64_t lda,
+                                      const std::complex<double> *b, std::int64_t ldb,
+                                      std::complex<double> beta, std::complex<double> *c,
+                                      std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*chemm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // hemm (USM pointers): complex<float>
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *a,
+                                      std::int64_t lda, const std::complex<float> *b,
+                                      std::int64_t ldb, std::complex<float> beta,
+                                      std::complex<float> *c, std::int64_t ldc,  // c is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zhemm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // hemm (USM pointers): complex<double>
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *a,
+                                      std::int64_t lda, const std::complex<double> *b,
+                                      std::int64_t ldb, std::complex<double> beta,
+                                      std::complex<double> *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cherk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // herk (USM pointers): complex<float>
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      float alpha, const std::complex<float> *a, std::int64_t lda,  // alpha is real (float)
+                                      float beta, std::complex<float> *c, std::int64_t ldc,  // beta is real (float)
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zherk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // herk (USM pointers): complex<double>
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      double alpha, const std::complex<double> *a, std::int64_t lda,  // alpha is real (double)
+                                      double beta, std::complex<double> *c, std::int64_t ldc,  // beta is real (double)
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cher2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // her2k (USM pointers): complex<float>
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       std::complex<float> alpha, const std::complex<float> *a,  // alpha is complex
+                                       std::int64_t lda, const std::complex<float> *b,
+                                       std::int64_t ldb, float beta, std::complex<float> *c,  // beta is real (float)
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zher2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // her2k (USM pointers): complex<double>
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       std::complex<double> alpha, const std::complex<double> *a,
+                                       std::int64_t lda, const std::complex<double> *b,
+                                       std::int64_t ldb, double beta, std::complex<double> *c,  // beta is real (double)
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // symm (USM pointers): float
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      float alpha, const float *a, std::int64_t lda, const float *b,
+                                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,  // c is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // symm (USM pointers): double
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      double alpha, const double *a, std::int64_t lda,
+                                      const double *b, std::int64_t ldb, double beta, double *c,
+                                      std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*csymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // symm (USM pointers): complex<float>
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *a,
+                                      std::int64_t lda, const std::complex<float> *b,
+                                      std::int64_t ldb, std::complex<float> beta,
+                                      std::complex<float> *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zsymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // symm (USM pointers): complex<double>
+                                      onemkl::uplo upper_lower, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *a,
+                                      std::int64_t lda, const std::complex<double> *b,
+                                      std::int64_t ldb, std::complex<double> beta,
+                                      std::complex<double> *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syrk (USM pointers): float
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      float alpha, const float *a, std::int64_t lda, float beta,
+                                      float *c, std::int64_t ldc,  // c is the only non-const pointer
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syrk (USM pointers): double
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      double alpha, const double *a, std::int64_t lda, double beta,
+                                      double *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*csyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syrk (USM pointers): complex<float>
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      std::complex<float> alpha, const std::complex<float> *a,  // alpha and beta are both complex here
+                                      std::int64_t lda, std::complex<float> beta,
+                                      std::complex<float> *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zsyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syrk (USM pointers): complex<double>
+                                      onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                      std::complex<double> alpha, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> beta,
+                                      std::complex<double> *c, std::int64_t ldc,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ssyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2k (USM pointers): float
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       float alpha, const float *a, std::int64_t lda,
+                                       const float *b, std::int64_t ldb, float beta, float *c,  // c is the only non-const pointer
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dsyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2k (USM pointers): double
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       double alpha, const double *a, std::int64_t lda,
+                                       const double *b, std::int64_t ldb, double beta, double *c,
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*csyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2k (USM pointers): complex<float>
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       std::complex<float> alpha, const std::complex<float> *a,
+                                       std::int64_t lda, const std::complex<float> *b,
+                                       std::int64_t ldb, std::complex<float> beta,
+                                       std::complex<float> *c, std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zsyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,  // syr2k (USM pointers): complex<double>
+                                       onemkl::transpose trans, std::int64_t n, std::int64_t k,
+                                       std::complex<double> alpha, const std::complex<double> *a,
+                                       std::int64_t lda, const std::complex<double> *b,
+                                       std::int64_t ldb, std::complex<double> beta,
+                                       std::complex<double> *c, std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*strmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // trmm (USM pointers): float
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      float alpha, const float *a, std::int64_t lda, float *b,  // b is the only non-const pointer; no beta/c
+                                      std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // trmm (USM pointers): double
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      double alpha, const double *a, std::int64_t lda, double *b,
+                                      std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // trmm (USM pointers): complex<float>
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,  // trmm (USM pointers): complex<double>
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*strsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      float alpha, const float *a, std::int64_t lda, float *b,
+                                      std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dtrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      double alpha, const double *a, std::int64_t lda, double *b,
+                                      std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ctrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      std::complex<float> alpha, const std::complex<float> *a,
+                                      std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*ztrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right,
+                                      onemkl::uplo upper_lower, onemkl::transpose trans,
+                                      onemkl::diag unit_diag, std::int64_t m, std::int64_t n,
+                                      std::complex<double> alpha, const std::complex<double> *a,
+                                      std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
+                                      const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgemm_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb,
+        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a,
+        std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c,
+        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgemm_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb,
+        std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a,
+        std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c,
+        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgemm_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb,
+        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<float> *alpha,
+        const std::complex<float> **a, std::int64_t *lda, const std::complex<float> **b,
+        std::int64_t *ldb, std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+        std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgemm_batch_group_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb,
+        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
+        const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
+        std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+        std::int64_t group_count, std::int64_t *group_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgemm_batch_strided_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+        std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
+        std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
+        float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgemm_batch_strided_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+        std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
+        std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b,
+        double beta, double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgemm_batch_strided_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
+        std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
+        std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
+        std::int64_t stride_c, std::int64_t batch_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgemm_batch_strided_usm_sycl)(
+        cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m,
+        std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
+        std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
+        std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
+        std::int64_t stride_c, std::int64_t batch_size,
+        const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*sgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                       onemkl::transpose transa, onemkl::transpose transb,
+                                       std::int64_t n, std::int64_t k, float alpha, const float *a,
+                                       std::int64_t lda, const float *b, std::int64_t ldb,
+                                       float beta, float *c, std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*dgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                       onemkl::transpose transa, onemkl::transpose transb,
+                                       std::int64_t n, std::int64_t k, double alpha,
+                                       const double *a, std::int64_t lda, const double *b,
+                                       std::int64_t ldb, double beta, double *c, std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*cgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                       onemkl::transpose transa, onemkl::transpose transb,
+                                       std::int64_t n, std::int64_t k, std::complex<float> alpha,
+                                       const std::complex<float> *a, std::int64_t lda,
+                                       const std::complex<float> *b, std::int64_t ldb,
+                                       std::complex<float> beta, std::complex<float> *c,
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+    cl::sycl::event (*zgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower,
+                                       onemkl::transpose transa, onemkl::transpose transb,
+                                       std::int64_t n, std::int64_t k, std::complex<double> alpha,
+                                       const std::complex<double> *a, std::int64_t lda,
+                                       const std::complex<double> *b, std::int64_t ldb,
+                                       std::complex<double> beta, std::complex<double> *c,
+                                       std::int64_t ldc,
+                                       const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
 } function_table_t;
 
 #endif //_BLAS_FUNCTION_TABLE_HPP_
diff --git a/src/include/exceptions_helper.hpp b/src/include/exceptions_helper.hpp
new file mode 100644
index 000000000..80d1ddca9
--- /dev/null
+++ b/src/include/exceptions_helper.hpp
@@ -0,0 +1,34 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef ONEMKL_EXCEPTIONS_HELPER_HPP
+#define ONEMKL_EXCEPTIONS_HELPER_HPP
+
+#include <stdexcept>
+
+namespace onemkl {
+// Signals that the requested operation is not yet supported by the backend in use.
+class backend_unsupported_exception : public std::runtime_error {
+public:
+    backend_unsupported_exception() : std::runtime_error("Not yet supported for this backend") {}
+};
+
+} // namespace onemkl
+
+#endif // ONEMKL_EXCEPTIONS_HELPER_HPP
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index 4d364d1fa..1a3a00f0b 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -86,10 +86,12 @@ if(BUILD_SHARED_LIBS)
   gtest_discover_tests(test_main_rt
     PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
     PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
+    DISCOVERY_TIMEOUT 30  # seconds allowed for listing the tests (CMake's default is 5)
   )
 endif()
 
 gtest_discover_tests(test_main_ct
   PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
   PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
+  DISCOVERY_TIMEOUT 30  # seconds allowed for listing the tests (CMake's default is 5)
 )
diff --git a/tests/unit_tests/blas/batch/CMakeLists.txt b/tests/unit_tests/blas/batch/CMakeLists.txt
index 321cea698..f14c50b8a 100644
--- a/tests/unit_tests/blas/batch/CMakeLists.txt
+++ b/tests/unit_tests/blas/batch/CMakeLists.txt
@@ -18,7 +18,7 @@
 #===============================================================================
 
 # Build object from all test sources
-set(BATCH_SOURCES "gemm_batch.cpp" "gemm_batch_stride.cpp" "trsm_batch.cpp" "trsm_batch_stride.cpp")
+set(BATCH_SOURCES "gemm_batch_stride.cpp" "trsm_batch_stride.cpp" "gemm_batch_usm.cpp" "gemm_batch_stride_usm.cpp" "axpy_batch_usm.cpp")
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_batch_rt OBJECT ${BATCH_SOURCES})
diff --git a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
new file mode 100644
index 000000000..c5d6d5963
--- /dev/null
+++ b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
@@ -0,0 +1,243 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "allocator_helper.hpp"
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one randomly generated grouped AXPY_BATCH problem through the USM API on dev and
+// checks every output vector against the reference ::axpy result.
+//
+// group_count: number of groups; each group gets a random size (1..100), vector length
+//              (1..500), strides in {-2, -1, 1, 2} and a random scalar alpha.
+//
+// Returns 1 if all vectors match, 0 on a mismatch or an allocation failure, and
+// test_skipped if the call throws onemkl::backend_unsupported_exception.
+template <typename fp>
+int test(const device &dev, int64_t group_count) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    int64_t *n    = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *incx = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *incy = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    fp *alpha     = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt);
+    int64_t *group_size =
+        (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    fp **x_array           = nullptr;
+    fp **y_array           = nullptr;
+    fp **y_ref_array       = nullptr;
+    int64_t vectors_in_use = 0; // pointer-table slots that have been assigned so far
+
+    // Single cleanup routine shared by every exit path so that no path can forget an
+    // allocation. Null pointers (failed or not-yet-made allocations) are skipped.
+    auto release = [&cxt](void *ptr) {
+        if (ptr != nullptr)
+            onemkl::free_shared(ptr, cxt);
+    };
+    auto release_all = [&]() {
+        for (int64_t v = 0; v < vectors_in_use; v++) {
+            release(x_array[v]);
+            release(y_array[v]);
+            release(y_ref_array[v]);
+        }
+        release(n);
+        release(incx);
+        release(incy);
+        release(alpha);
+        release(group_size);
+        release(x_array);
+        release(y_array);
+        release(y_ref_array);
+    };
+
+    if ((n == NULL) || (incx == NULL) || (incy == NULL) || (alpha == NULL) ||
+        (group_size == NULL)) {
+        std::cout << "Error cannot allocate input arrays\n";
+        release_all();
+        return false;
+    }
+
+    int64_t i, j, idx;
+    int64_t total_size_x, total_size_y;
+    int64_t total_batch_count = 0;
+
+    for (i = 0; i < group_count; i++) {
+        group_size[i] = 1 + std::rand() % 100;
+        n[i]          = 1 + std::rand() % 500;
+        incx[i]       = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
+        incy[i]       = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
+        alpha[i]      = rand_scalar<fp>();
+        total_batch_count += group_size[i];
+    }
+
+    x_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    y_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    y_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+
+    if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) {
+        std::cout << "Error cannot allocate arrays of pointers\n";
+        release_all(); // frees the per-group arrays as well as the pointer tables
+        return false;
+    }
+
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        total_size_x = (1 + (n[i] - 1) * std::abs(incx[i]));
+        total_size_y = (1 + (n[i] - 1) * std::abs(incy[i]));
+        for (j = 0; j < group_size[i]; j++) {
+            x_array[idx]     = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_x, dev, cxt);
+            y_array[idx]     = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_y, dev, cxt);
+            y_ref_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_y, dev, cxt);
+            // Record the slot before checking it so a partial failure is still released.
+            vectors_in_use   = idx + 1;
+            if ((x_array[idx] == NULL) || (y_array[idx] == NULL) ||
+                (y_ref_array[idx] == NULL)) {
+                std::cout << "Error cannot allocate input vectors\n";
+                release_all();
+                return false;
+            }
+            rand_vector(x_array[idx], n[i], incx[i]);
+            rand_vector(y_array[idx], n[i], incy[i]);
+            copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]);
+            idx++;
+        }
+    }
+
+    // Call reference AXPY_BATCH.
+    using fp_ref = typename ref_type_info<fp>::type;
+    int n_ref, incx_ref, incy_ref;
+
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            n_ref    = (int)n[i];
+            incx_ref = (int)incx[i];
+            incy_ref = (int)incy[i];
+            ::axpy((const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)x_array[idx],
+                   (const int *)&incx_ref, (fp_ref *)y_ref_array[idx], (const int *)&incy_ref);
+            idx++;
+        }
+    }
+
+    // Call DPC++ AXPY_BATCH.
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::axpy_batch(main_queue, n, alpha, (const fp **)x_array, incx, y_array,
+                                        incy, group_count, group_size, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::axpy_batch,
+                    (main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count,
+                     group_size, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during AXPY_BATCH:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        release_all();
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of AXPY_BATCH:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = true;
+    idx       = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            good = good && check_equal_vector(y_array[idx], y_ref_array[idx], n[i], incy[i], n[i],
+                                              std::cout);
+            idx++;
+        }
+    }
+
+    release_all();
+    return (int)good;
+}
+
+class AxpyBatchUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case below runs test<fp>() with 5 groups on the parameterized device.
+TEST_P(AxpyBatchUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 5));
+}
+// Instantiate the suite over every entry of the externally defined devices vector.
+INSTANTIATE_TEST_SUITE_P(AxpyBatchUsmTestSuite, AxpyBatchUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch.cpp b/tests/unit_tests/blas/batch/gemm_batch.cpp
deleted file mode 100644
index 9dffd0634..000000000
--- a/tests/unit_tests/blas/batch/gemm_batch.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/*******************************************************************************
-* Copyright 2020 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#include <CL/sycl.hpp>
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "onemkl/detail/config.hpp"
-#include "onemkl/onemkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace cl::sycl;
-using std::vector;
-
-extern std::vector<cl::sycl::device> devices;
-
-namespace {
-
-template <typename fp>
-bool test(const device &dev, int64_t group_count) {
-    // Prepare data.
-    int64_t *m   = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *n   = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *k   = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *lda = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *ldb = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *ldc = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    onemkl::transpose *transa =
-        (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count);
-    onemkl::transpose *transb =
-        (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count);
-    fp *alpha           = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count);
-    fp *beta            = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count);
-    int64_t *group_size = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-
-    if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) || (ldb == NULL) ||
-        (ldc == NULL) || (transa == NULL) || (transb == NULL) || (alpha == NULL) ||
-        (beta == NULL) || (group_size == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        onemkl::aligned_free(m);
-        onemkl::aligned_free(n);
-        onemkl::aligned_free(k);
-        onemkl::aligned_free(lda);
-        onemkl::aligned_free(ldb);
-        onemkl::aligned_free(ldc);
-        onemkl::aligned_free(transa);
-        onemkl::aligned_free(transb);
-        onemkl::aligned_free(alpha);
-        onemkl::aligned_free(beta);
-        onemkl::aligned_free(group_size);
-        return false;
-    }
-
-    int64_t i, tmp;
-    int64_t j, idx = 0, max_k = 0;
-    int64_t total_size_a = 0, total_size_b = 0, total_size_c = 0, total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0, size_c = 0;
-    int64_t off_a = 0, off_b = 0, off_c = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i]          = 1 + std::rand() % 500;
-        n[i]          = 1 + std::rand() % 500;
-        k[i]          = 1 + std::rand() % 500;
-        lda[i]        = std::max(m[i], k[i]);
-        ldb[i]        = std::max(n[i], k[i]);
-        ldc[i]        = std::max(m[i], n[i]);
-        alpha[i]      = rand_scalar<fp>();
-        beta[i]       = rand_scalar<fp>();
-        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-            transa[i] = (onemkl::transpose)(std::rand() % 2);
-            transb[i] = (onemkl::transpose)(std::rand() % 2);
-        }
-        else {
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                transa[i] = onemkl::transpose::conjtrans;
-            else
-                transa[i] = (onemkl::transpose)tmp;
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                transb[i] = onemkl::transpose::conjtrans;
-            else
-                transb[i] = (onemkl::transpose)tmp;
-        }
-        total_size_a +=
-            lda[i] * group_size[i] * ((transa[i] == onemkl::transpose::nontrans) ? k[i] : m[i]);
-        total_size_b +=
-            ldb[i] * group_size[i] * ((transb[i] == onemkl::transpose::nontrans) ? n[i] : k[i]);
-        total_size_c += ldc[i] * n[i] * group_size[i];
-        total_batch_count += group_size[i];
-    }
-
-    fp **a_array     = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-    fp **b_array     = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-    fp **c_array     = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-    fp **c_ref_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-
-    if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        onemkl::aligned_free(a_array);
-        onemkl::aligned_free(b_array);
-        onemkl::aligned_free(c_array);
-        onemkl::aligned_free(c_ref_array);
-        return false;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(total_size_a), B(total_size_b), C(total_size_c),
-        C_ref(total_size_c);
-
-    for (i = 0; i < group_count; i++) {
-        max_k  = std::max(max_k, k[i]);
-        size_a = (transa[i] == onemkl::transpose::nontrans) ? k[i] * lda[i] : m[i] * lda[i];
-        size_b = (transb[i] == onemkl::transpose::nontrans) ? n[i] * ldb[i] : k[i] * ldb[i];
-        size_c = n[i] * ldc[i];
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx]     = A.data() + off_a;
-            b_array[idx]     = B.data() + off_b;
-            c_array[idx]     = C.data() + off_c;
-            c_ref_array[idx] = C_ref.data() + off_c;
-            rand_matrix(a_array[idx], transa[i], m[i], k[i], lda[i]);
-            rand_matrix(b_array[idx], transb[i], k[i], n[i], ldb[i]);
-            rand_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i]);
-            off_a += size_a;
-            off_b += size_b;
-            off_c += size_c;
-            idx++;
-        }
-    }
-    C_ref = C;
-
-    // Call reference GEMM_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref, n_ref, k_ref, lda_ref, ldb_ref, ldc_ref, group_size_ref;
-    CBLAS_TRANSPOSE transa_ref, transb_ref;
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        m_ref          = (int)m[i];
-        n_ref          = (int)n[i];
-        k_ref          = (int)k[i];
-        lda_ref        = (int)lda[i];
-        ldb_ref        = (int)ldb[i];
-        ldc_ref        = (int)ldc[i];
-        group_size_ref = (int)group_size[i];
-        transa_ref     = convert_to_cblas_trans(transa[i]);
-        transb_ref     = convert_to_cblas_trans(transb[i]);
-        for (j = 0; j < group_size_ref; j++) {
-            ::gemm(transa_ref, transb_ref, (const int *)&m_ref, (const int *)&n_ref,
-                   (const int *)&k_ref, (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx],
-                   (const int *)&lda_ref, (const fp_ref *)b_array[idx], (const int *)&ldb_ref,
-                   (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ GEMM_BATCH.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n"
-                          << e.what() << std::endl
-                          << "OpenCL status: " << e.get_cl_code() << std::endl;
-            }
-        }
-    };
-
-    queue main_queue(dev, exception_handler);
-
-    buffer<onemkl::transpose, 1> transa_buffer(transa, range<1>(group_count));
-    buffer<onemkl::transpose, 1> transb_buffer(transb, range<1>(group_count));
-    buffer<std::int64_t, 1> m_buffer(m, range<1>(group_count));
-    buffer<std::int64_t, 1> n_buffer(n, range<1>(group_count));
-    buffer<std::int64_t, 1> k_buffer(k, range<1>(group_count));
-    buffer<std::int64_t, 1> lda_buffer(lda, range<1>(group_count));
-    buffer<std::int64_t, 1> ldb_buffer(ldb, range<1>(group_count));
-    buffer<std::int64_t, 1> ldc_buffer(ldc, range<1>(group_count));
-    buffer<std::int64_t, 1> group_size_buffer(group_size, range<1>(group_count));
-    buffer<fp, 1> alpha_buffer(alpha, range<1>(group_count));
-    buffer<fp, 1> beta_buffer(beta, range<1>(group_count));
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        onemkl::blas::gemm_batch(main_queue, transa_buffer, transb_buffer, m_buffer, n_buffer,
-                                 k_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer,
-                                 beta_buffer, C_buffer, ldc_buffer, group_count, group_size_buffer);
-#else
-        TEST_RUN_CT(main_queue, onemkl::blas::gemm_batch,
-                    (main_queue, transa_buffer, transb_buffer, m_buffer, n_buffer, k_buffer,
-                     alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer, beta_buffer,
-                     C_buffer, ldc_buffer, group_count, group_size_buffer));
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n"
-                  << e.what() << std::endl
-                  << "OpenCL status: " << e.get_cl_code() << std::endl;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // GEMM_BATCH currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            onemkl::aligned_free(m);
-            onemkl::aligned_free(n);
-            onemkl::aligned_free(k);
-            onemkl::aligned_free(lda);
-            onemkl::aligned_free(ldb);
-            onemkl::aligned_free(ldc);
-            onemkl::aligned_free(transa);
-            onemkl::aligned_free(transb);
-            onemkl::aligned_free(alpha);
-            onemkl::aligned_free(beta);
-            onemkl::aligned_free(group_size);
-            onemkl::aligned_free(a_array);
-            onemkl::aligned_free(b_array);
-            onemkl::aligned_free(c_array);
-            onemkl::aligned_free(c_ref_array);
-            return true;
-        }
-#endif
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good;
-    {
-        auto C_accessor = C_buffer.template get_access<access::mode::read>();
-        good = check_equal_matrix(C_accessor, C_ref, total_size_c, 1, total_size_c, 10 * max_k,
-                                  std::cout);
-    }
-
-    onemkl::aligned_free(m);
-    onemkl::aligned_free(n);
-    onemkl::aligned_free(k);
-    onemkl::aligned_free(lda);
-    onemkl::aligned_free(ldb);
-    onemkl::aligned_free(ldc);
-    onemkl::aligned_free(transa);
-    onemkl::aligned_free(transb);
-    onemkl::aligned_free(alpha);
-    onemkl::aligned_free(beta);
-    onemkl::aligned_free(group_size);
-    onemkl::aligned_free(a_array);
-    onemkl::aligned_free(b_array);
-    onemkl::aligned_free(c_array);
-    onemkl::aligned_free(c_ref_array);
-
-    return good;
-}
-
-class GemmBatchTests : public ::testing::TestWithParam<cl::sycl::device> {};
-
-TEST_P(GemmBatchTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 5));
-}
-
-TEST_P(GemmBatchTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 5));
-}
-
-TEST_P(GemmBatchTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 5));
-}
-
-TEST_P(GemmBatchTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBatchTestSuite, GemmBatchTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp
index a7f4e32d8..2e2bc14b1 100644
--- a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp
+++ b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp
@@ -44,7 +44,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int64_t batch_size) {
+int test(const device &dev, int64_t batch_size) {  // 1 = pass, 0 = fail, test_skipped = skip
     // Prepare data.
     int64_t m, n, k;
     int64_t lda, ldb, ldc;
@@ -153,16 +153,13 @@ bool test(const device &dev, int64_t batch_size) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;  // backend reported the routine as unsupported
+    }
+
     catch (const std::runtime_error &error) {
         std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n"
                   << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // GEMM_BATCH_STRIDE currently not supported with CUBLAS backend
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            return true;
-        }
-#endif
     }
 
     // Compare the results of reference implementation and DPC++ implementation.
@@ -173,25 +170,25 @@ bool test(const device &dev, int64_t batch_size) {
                                   stride_c * batch_size, 10 * k, std::cout);
     }
 
-    return good;
+    return (int)good;  // 1 when the results match the reference, 0 otherwise
 }
 
 class GemmBatchStrideTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(GemmBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 5));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 5));
 }
 
 TEST_P(GemmBatchStrideTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 5));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 5));
 }
 
 TEST_P(GemmBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 5));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 5));
 }
 
 TEST_P(GemmBatchStrideTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 5));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 5));
 }
 
 INSTANTIATE_TEST_SUITE_P(GemmBatchStrideTestSuite, GemmBatchStrideTests,
diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
new file mode 100644
index 000000000..602b883e1
--- /dev/null
+++ b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
@@ -0,0 +1,228 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "allocator_helper.hpp"
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device &dev, int64_t batch_size) {  // 1 = pass, 0 = fail, test_skipped = skip
+    // Catch asynchronous exceptions: print them to stdout; nothing is rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: one random shape, transposes and scalars shared by the whole batch.
+    int64_t m, n, k;
+    int64_t lda, ldb, ldc;
+    onemkl::transpose transa, transb;
+    fp alpha, beta;
+
+    int64_t i, tmp;
+
+    batch_size = 1 + std::rand() % 20;  // NOTE: replaces the caller-supplied batch_size
+    m          = 1 + std::rand() % 500;
+    n          = 1 + std::rand() % 500;
+    k          = 1 + std::rand() % 500;
+    lda        = std::max(m, k);
+    ldb        = std::max(n, k);
+    ldc        = std::max(m, n);
+    alpha      = rand_scalar<fp>();
+    beta       = rand_scalar<fp>();
+    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
+        transa = (onemkl::transpose)(std::rand() % 2);
+        transb = (onemkl::transpose)(std::rand() % 2);
+    }
+    else {
+        tmp = std::rand() % 3;
+        if (tmp == 2)
+            transa = onemkl::transpose::conjtrans;
+        else
+            transa = (onemkl::transpose)tmp;
+        tmp = std::rand() % 3;
+        if (tmp == 2)
+            transb = onemkl::transpose::conjtrans;
+        else
+            transb = (onemkl::transpose)tmp;
+    }
+
+    int64_t stride_a, stride_b, stride_c;  // element distance between consecutive matrices
+
+    stride_a = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m;
+    stride_b = (transb == onemkl::transpose::nontrans) ? ldb * n : ldb * k;
+    stride_c = ldc * n;
+
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);  // backs A, B, C and C_ref
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua), C_ref(ua);
+
+    A.resize(stride_a * batch_size);
+    B.resize(stride_b * batch_size);
+    C.resize(stride_c * batch_size);
+    C_ref.resize(stride_c * batch_size);
+
+    fp **a_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt);
+    fp **b_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt);
+    fp **c_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt);
+    fp **c_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt);
+
+    if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
+        std::cout << "Error cannot allocate arrays of pointers\n";
+        onemkl::free_shared(a_array, cxt);
+        onemkl::free_shared(b_array, cxt);
+        onemkl::free_shared(c_array, cxt);
+        onemkl::free_shared(c_ref_array, cxt);
+        return false;  // converts to 0, the same value a failed comparison returns
+    }
+
+    for (i = 0; i < batch_size; i++) {  // slice the flat vectors into per-matrix pointers
+        a_array[i]     = &A[i * stride_a];
+        b_array[i]     = &B[i * stride_b];
+        c_array[i]     = &C[i * stride_c];
+        c_ref_array[i] = &C_ref[i * stride_c];
+    }
+
+    rand_matrix(A, onemkl::transpose::nontrans, stride_a * batch_size, 1, stride_a * batch_size);
+    rand_matrix(B, onemkl::transpose::nontrans, stride_b * batch_size, 1, stride_b * batch_size);
+    rand_matrix(C, onemkl::transpose::nontrans, stride_c * batch_size, 1, stride_c * batch_size);
+    copy_matrix(C, onemkl::transpose::nontrans, stride_c * batch_size, 1, stride_c * batch_size,
+                C_ref);
+
+    // Call reference GEMM_BATCH_STRIDE: one ::gemm per batch entry, writing into C_ref.
+    using fp_ref       = typename ref_type_info<fp>::type;
+    int m_ref          = (int)m;
+    int n_ref          = (int)n;
+    int k_ref          = (int)k;
+    int lda_ref        = (int)lda;
+    int ldb_ref        = (int)ldb;
+    int ldc_ref        = (int)ldc;
+    int batch_size_ref = (int)batch_size;
+    for (i = 0; i < batch_size_ref; i++) {
+        ::gemm(convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), (const int *)&m_ref,
+               (const int *)&n_ref, (const int *)&k_ref, (const fp_ref *)&alpha,
+               (const fp_ref *)a_array[i], (const int *)&lda_ref, (const fp_ref *)b_array[i],
+               (const int *)&ldb_ref, (const fp_ref *)&beta, (fp_ref *)c_ref_array[i],
+               (const int *)&ldc_ref);
+    }
+
+    // Call DPC++ GEMM_BATCH_STRIDE on the flat USM buffers and wait for completion.
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::gemm_batch(main_queue, transa, transb, m, n, k, alpha, &A[0], lda,
+                                        stride_a, &B[0], ldb, stride_b, beta, &C[0], ldc, stride_c,
+                                        batch_size, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::gemm_batch,
+                    (main_queue, transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0], ldb,
+                     stride_b, beta, &C[0], ldc, stride_c, batch_size, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        onemkl::free_shared(a_array, cxt);
+        onemkl::free_shared(b_array, cxt);
+        onemkl::free_shared(c_array, cxt);
+        onemkl::free_shared(c_ref_array, cxt);
+        return test_skipped;  // backend reported the routine as unsupported
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n"
+                  << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = true;
+    {
+        good = check_equal_matrix(C, C_ref, stride_c * batch_size, 1, stride_c * batch_size, 10 * k,
+                                  std::cout);
+    }
+
+    onemkl::free_shared(a_array, cxt);
+    onemkl::free_shared(b_array, cxt);
+    onemkl::free_shared(c_array, cxt);
+    onemkl::free_shared(c_ref_array, cxt);
+    return (int)good;
+}
+
+class GemmBatchStrideUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(GemmBatchStrideUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 5));  // 5 is overwritten inside test()
+}
+
+TEST_P(GemmBatchStrideUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 5));  // 5 is overwritten inside test()
+}
+
+TEST_P(GemmBatchStrideUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 5));  // 5 is overwritten in test()
+}
+
+TEST_P(GemmBatchStrideUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 5));  // 5 is overwritten in test()
+}
+
+INSTANTIATE_TEST_SUITE_P(GemmBatchStrideUsmTestSuite, GemmBatchStrideUsmTests,
+                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
new file mode 100644
index 000000000..6d9918798
--- /dev/null
+++ b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
@@ -0,0 +1,339 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "allocator_helper.hpp"
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one randomized grouped GEMM_BATCH problem through the USM pointer-array API on `dev`
+// and compares every result matrix with the reference ::gemm.
+//
+// group_count is the number of groups; each group draws its own sizes, leading dimensions,
+// transposes, scalars and batch size (1..20 matrices per group).
+//
+// Returns 1 when every matrix matches, 0 when any differs or an allocation fails, and
+// test_skipped when the backend throws onemkl::backend_unsupported_exception.
+template <typename fp>
+int test(const device &dev, int64_t group_count) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Per-group parameter arrays, allocated as shared USM and handed to gemm_batch below.
+    int64_t *m   = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *n   = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *k   = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *lda = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *ldb = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *ldc = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    onemkl::transpose *transa = (onemkl::transpose *)onemkl::malloc_shared(
+        64, sizeof(onemkl::transpose) * group_count, dev, cxt);
+    onemkl::transpose *transb = (onemkl::transpose *)onemkl::malloc_shared(
+        64, sizeof(onemkl::transpose) * group_count, dev, cxt);
+    fp *alpha = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt);
+    fp *beta  = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt);
+    int64_t *group_size =
+        (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+
+    // Everything allocated later is declared up front and starts out NULL so that one helper,
+    // release_all(), can be called from every exit path without leaking earlier allocations.
+    int64_t *total_size_a = NULL, *total_size_b = NULL, *total_size_c = NULL;
+    fp **a_array = NULL, **b_array = NULL, **c_array = NULL, **c_ref_array = NULL;
+    int *m_ref = NULL, *n_ref = NULL, *k_ref = NULL;
+    int *lda_ref = NULL, *ldb_ref = NULL, *ldc_ref = NULL, *group_size_ref = NULL;
+    CBLAS_TRANSPOSE *transa_ref = NULL, *transb_ref = NULL;
+    // Number of leading slots in the four pointer arrays that hold a per-matrix allocation.
+    int64_t matrices_allocated = 0;
+
+    // Frees every allocation made so far. Pointers that were never allocated are still NULL;
+    // NOTE(review): assumes aligned_free/free_shared accept NULL - confirm in allocator_helper.
+    auto release_all = [&]() {
+        for (int64_t p = 0; p < matrices_allocated; p++) {
+            onemkl::free_shared(a_array[p], cxt);
+            onemkl::free_shared(b_array[p], cxt);
+            onemkl::free_shared(c_array[p], cxt);
+            onemkl::free_shared(c_ref_array[p], cxt);
+        }
+        onemkl::aligned_free(total_size_a);
+        onemkl::aligned_free(total_size_b);
+        onemkl::aligned_free(total_size_c);
+        onemkl::aligned_free(m_ref);
+        onemkl::aligned_free(n_ref);
+        onemkl::aligned_free(k_ref);
+        onemkl::aligned_free(lda_ref);
+        onemkl::aligned_free(ldb_ref);
+        onemkl::aligned_free(ldc_ref);
+        onemkl::aligned_free(transa_ref);
+        onemkl::aligned_free(transb_ref);
+        onemkl::aligned_free(group_size_ref);
+        onemkl::free_shared(m, cxt);
+        onemkl::free_shared(n, cxt);
+        onemkl::free_shared(k, cxt);
+        onemkl::free_shared(lda, cxt);
+        onemkl::free_shared(ldb, cxt);
+        onemkl::free_shared(ldc, cxt);
+        onemkl::free_shared(transa, cxt);
+        onemkl::free_shared(transb, cxt);
+        onemkl::free_shared(alpha, cxt);
+        onemkl::free_shared(beta, cxt);
+        onemkl::free_shared(group_size, cxt);
+        onemkl::free_shared(a_array, cxt);
+        onemkl::free_shared(b_array, cxt);
+        onemkl::free_shared(c_array, cxt);
+        onemkl::free_shared(c_ref_array, cxt);
+    };
+
+    if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) || (ldb == NULL) ||
+        (ldc == NULL) || (transa == NULL) || (transb == NULL) || (alpha == NULL) ||
+        (beta == NULL) || (group_size == NULL)) {
+        std::cout << "Error cannot allocate input arrays\n";
+        release_all();
+        return false;
+    }
+
+    int64_t i, tmp;
+    int64_t j, idx = 0;
+    int64_t total_batch_count = 0;
+
+    // Host-side element counts of one A/B/C matrix in each group.
+    total_size_a = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
+    total_size_b = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
+    total_size_c = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
+    if ((total_size_a == NULL) || (total_size_b == NULL) || (total_size_c == NULL)) {
+        std::cout << "Error cannot allocate input arrays\n";
+        release_all();
+        return false;
+    }
+
+    for (i = 0; i < group_count; i++) {
+        group_size[i] = 1 + std::rand() % 20;
+        m[i]          = 1 + std::rand() % 500;
+        n[i]          = 1 + std::rand() % 500;
+        k[i]          = 1 + std::rand() % 500;
+        lda[i]        = std::max(m[i], k[i]);
+        ldb[i]        = std::max(n[i], k[i]);
+        ldc[i]        = std::max(m[i], n[i]);
+        alpha[i]      = rand_scalar<fp>();
+        beta[i]       = rand_scalar<fp>();
+        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
+            transa[i] = (onemkl::transpose)(std::rand() % 2);
+            transb[i] = (onemkl::transpose)(std::rand() % 2);
+        }
+        else {
+            tmp = std::rand() % 3;
+            if (tmp == 2)
+                transa[i] = onemkl::transpose::conjtrans;
+            else
+                transa[i] = (onemkl::transpose)tmp;
+            tmp = std::rand() % 3;
+            if (tmp == 2)
+                transb[i] = onemkl::transpose::conjtrans;
+            else
+                transb[i] = (onemkl::transpose)tmp;
+        }
+        total_size_a[i] = lda[i] * ((transa[i] == onemkl::transpose::nontrans) ? k[i] : m[i]);
+        total_size_b[i] = ldb[i] * ((transb[i] == onemkl::transpose::nontrans) ? n[i] : k[i]);
+        total_size_c[i] = ldc[i] * n[i];
+        total_batch_count += group_size[i];
+    }
+
+    a_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    b_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    c_array     = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    c_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+
+    if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
+        std::cout << "Error cannot allocate arrays of pointers\n";
+        release_all();
+        return false;
+    }
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            a_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_a[i], dev, cxt);
+            b_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_b[i], dev, cxt);
+            c_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_c[i], dev, cxt);
+            c_ref_array[idx] =
+                (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_c[i], dev, cxt);
+            // Count the slot before checking it so release_all() also frees a partial set.
+            matrices_allocated = idx + 1;
+            if ((a_array[idx] == NULL) || (b_array[idx] == NULL) || (c_array[idx] == NULL) ||
+                (c_ref_array[idx] == NULL)) {
+                std::cout << "Error cannot allocate input matrices\n";
+                release_all();
+                return false;
+            }
+
+            rand_matrix(a_array[idx], transa[i], m[i], k[i], lda[i]);
+            rand_matrix(b_array[idx], transb[i], k[i], n[i], ldb[i]);
+            rand_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i]);
+            copy_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i],
+                        c_ref_array[idx]);
+            idx++;
+        }
+    }
+
+    // Call reference GEMM_BATCH.
+    using fp_ref   = typename ref_type_info<fp>::type;
+    m_ref          = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    n_ref          = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    k_ref          = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    lda_ref        = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    ldb_ref        = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    ldc_ref        = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    group_size_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count);
+    transa_ref =
+        (CBLAS_TRANSPOSE *)onemkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
+    transb_ref =
+        (CBLAS_TRANSPOSE *)onemkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
+
+    if ((m_ref == NULL) || (n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) ||
+        (ldb_ref == NULL) || (ldc_ref == NULL) || (transa_ref == NULL) || (transb_ref == NULL) ||
+        (group_size_ref == NULL)) {
+        std::cout << "Error cannot allocate input arrays\n";
+        release_all();
+        return false;
+    }
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        transa_ref[i]     = convert_to_cblas_trans(transa[i]);
+        transb_ref[i]     = convert_to_cblas_trans(transb[i]);
+        m_ref[i]          = (int)m[i];
+        n_ref[i]          = (int)n[i];
+        k_ref[i]          = (int)k[i];
+        lda_ref[i]        = (int)lda[i];
+        ldb_ref[i]        = (int)ldb[i];
+        ldc_ref[i]        = (int)ldc[i];
+        group_size_ref[i] = (int)group_size[i];
+        for (j = 0; j < group_size_ref[i]; j++) {
+            ::gemm(transa_ref[i], transb_ref[i], (const int *)&m_ref[i], (const int *)&n_ref[i],
+                   (const int *)&k_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx],
+                   (const int *)&lda_ref[i], (const fp_ref *)b_array[idx], (const int *)&ldb_ref[i],
+                   (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]);
+            idx++;
+        }
+    }
+
+    // Call DPC++ GEMM_BATCH.
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::gemm_batch(main_queue, transa, transb, m, n, k, alpha,
+                                        (const fp **)a_array, lda, (const fp **)b_array, ldb, beta,
+                                        c_array, ldc, group_count, group_size, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::gemm_batch,
+            (main_queue, transa, transb, m, n, k, alpha, (const fp **)a_array, lda,
+             (const fp **)b_array, ldb, beta, c_array, ldc, group_count, group_size, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        release_all();
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = true;
+    {
+        idx = 0;
+        for (i = 0; i < group_count; i++) {
+            for (j = 0; j < group_size[i]; j++) {
+                good = good && check_equal_matrix(c_array[idx], c_ref_array[idx], m[i], n[i],
+                                                  ldc[i], 10 * k[i], std::cout);
+                idx++;
+            }
+        }
+    }
+
+    release_all();
+    return (int)good;
+}
+
+class GemmBatchUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(GemmBatchUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 5));
+}
+
+TEST_P(GemmBatchUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 5));
+}
+
+TEST_P(GemmBatchUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 5));
+}
+
+TEST_P(GemmBatchUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 5));
+}
+
+INSTANTIATE_TEST_SUITE_P(GemmBatchUsmTestSuite, GemmBatchUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/trsm_batch.cpp b/tests/unit_tests/blas/batch/trsm_batch.cpp
deleted file mode 100644
index 758303833..000000000
--- a/tests/unit_tests/blas/batch/trsm_batch.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*******************************************************************************
-* Copyright 2020 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#include <CL/sycl.hpp>
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "onemkl/detail/config.hpp"
-#include "onemkl/onemkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace cl::sycl;
-using std::vector;
-
-extern std::vector<cl::sycl::device> devices;
-
-namespace {
-
-template <typename fp>
-bool test(const device &dev, int64_t group_count) {
-    // Prepare data.
-    int64_t *m   = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *n   = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *lda = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    int64_t *ldb = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-    onemkl::transpose *trans =
-        (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count);
-    onemkl::side *left_right =
-        (onemkl::side *)onemkl::aligned_alloc(64, sizeof(onemkl::side) * group_count);
-    onemkl::uplo *upper_lower =
-        (onemkl::uplo *)onemkl::aligned_alloc(64, sizeof(onemkl::uplo) * group_count);
-    onemkl::diag *unit_diag =
-        (onemkl::diag *)onemkl::aligned_alloc(64, sizeof(onemkl::diag) * group_count);
-    fp *alpha           = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count);
-    int64_t *group_size = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count);
-
-    if ((m == NULL) || (n == NULL) || (lda == NULL) || (ldb == NULL) || (trans == NULL) ||
-        (left_right == NULL) || (upper_lower == NULL) || (unit_diag == NULL) || (alpha == NULL) ||
-        (group_size == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        onemkl::aligned_free(m);
-        onemkl::aligned_free(n);
-        onemkl::aligned_free(lda);
-        onemkl::aligned_free(ldb);
-        onemkl::aligned_free(trans);
-        onemkl::aligned_free(left_right);
-        onemkl::aligned_free(upper_lower);
-        onemkl::aligned_free(unit_diag);
-        onemkl::aligned_free(alpha);
-        onemkl::aligned_free(group_size);
-        return false;
-    }
-
-    int64_t i, tmp;
-    int64_t j, idx = 0, max = 0;
-    int64_t total_size_a = 0, total_size_b = 0, total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0;
-    int64_t off_a = 0, off_b = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i]          = 1 + std::rand() % 50;
-        n[i]          = 1 + std::rand() % 50;
-        lda[i]        = std::max(m[i], n[i]);
-        ldb[i]        = std::max(n[i], m[i]);
-        alpha[i]      = rand_scalar<fp>();
-        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-            trans[i] = (onemkl::transpose)(std::rand() % 2);
-        }
-        else {
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                trans[i] = onemkl::transpose::conjtrans;
-            else
-                trans[i] = (onemkl::transpose)tmp;
-        }
-        left_right[i]  = (onemkl::side)(std::rand() % 2);
-        upper_lower[i] = (onemkl::uplo)(std::rand() % 2);
-        unit_diag[i]   = (onemkl::diag)(std::rand() % 2);
-    }
-
-    for (i = 0; i < group_count; i++) {
-        total_size_a +=
-            lda[i] * group_size[i] * ((left_right[i] == onemkl::side::left) ? m[i] : n[i]);
-        total_size_b += ldb[i] * group_size[i] * n[i];
-        total_batch_count += group_size[i];
-    }
-
-    fp **a_array     = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-    fp **b_array     = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-    fp **b_ref_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count);
-
-    if ((a_array == NULL) || (b_array == NULL) || (b_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        onemkl::aligned_free(a_array);
-        onemkl::aligned_free(b_array);
-        onemkl::aligned_free(b_ref_array);
-        return false;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(total_size_a), B(total_size_b), B_ref(total_size_b);
-
-    for (i = 0; i < group_count; i++) {
-        max    = std::max(max, m[i]);
-        max    = std::max(max, n[i]);
-        size_a = (left_right[i] == onemkl::side::left) ? m[i] * lda[i] : n[i] * lda[i];
-        size_b = ldb[i] * n[i];
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx]     = A.data() + off_a;
-            b_array[idx]     = B.data() + off_b;
-            b_ref_array[idx] = B_ref.data() + off_b;
-            if (left_right[i] == onemkl::side::left)
-                rand_trsm_matrix(a_array[idx], trans[i], m[i], m[i], lda[i]);
-            else
-                rand_trsm_matrix(a_array[idx], trans[i], n[i], n[i], lda[i]);
-            rand_matrix(b_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldb[i]);
-            off_a += size_a;
-            off_b += size_b;
-            idx++;
-        }
-    }
-
-    B_ref = B;
-
-    // Call reference TRSM_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref, n_ref, lda_ref, ldb_ref, group_size_ref;
-    CBLAS_TRANSPOSE trans_ref;
-    CBLAS_SIDE side_ref;
-    CBLAS_DIAG diag_ref;
-    CBLAS_UPLO uplo_ref;
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        m_ref          = (int)m[i];
-        n_ref          = (int)n[i];
-        lda_ref        = (int)lda[i];
-        ldb_ref        = (int)ldb[i];
-        group_size_ref = (int)group_size[i];
-        trans_ref      = convert_to_cblas_trans(trans[i]);
-        side_ref       = convert_to_cblas_side(left_right[i]);
-        diag_ref       = convert_to_cblas_diag(unit_diag[i]);
-        uplo_ref       = convert_to_cblas_uplo(upper_lower[i]);
-        for (j = 0; j < group_size_ref; j++) {
-            ::trsm(side_ref, uplo_ref, trans_ref, diag_ref, (const int *)&m_ref,
-                   (const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx],
-                   (const int *)&lda_ref, (fp_ref *)b_ref_array[idx], (const int *)&ldb_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ TRSM_BATCH.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH:\n"
-                          << e.what() << std::endl
-                          << "OpenCL status: " << e.get_cl_code() << std::endl;
-            }
-        }
-    };
-
-    queue main_queue(dev, exception_handler);
-
-    buffer<onemkl::side, 1> side_buffer(left_right, range<1>(group_count));
-    buffer<onemkl::uplo, 1> uplo_buffer(upper_lower, range<1>(group_count));
-    buffer<onemkl::transpose, 1> trans_buffer(trans, range<1>(group_count));
-    buffer<onemkl::diag, 1> diag_buffer(unit_diag, range<1>(group_count));
-    buffer<int64_t, 1> m_buffer(m, range<1>(group_count));
-    buffer<int64_t, 1> n_buffer(n, range<1>(group_count));
-    buffer<int64_t, 1> lda_buffer(lda, range<1>(group_count));
-    buffer<int64_t, 1> ldb_buffer(ldb, range<1>(group_count));
-    buffer<int64_t, 1> group_size_buffer(group_size, range<1>(group_count));
-    buffer<fp, 1> alpha_buffer(alpha, range<1>(group_count));
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        onemkl::blas::trsm_batch(main_queue, side_buffer, uplo_buffer, trans_buffer, diag_buffer,
-                                 m_buffer, n_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer,
-                                 ldb_buffer, group_count, group_size_buffer);
-#else
-        TEST_RUN_CT(main_queue, onemkl::blas::trsm_batch,
-                    (main_queue, side_buffer, uplo_buffer, trans_buffer, diag_buffer, m_buffer,
-                     n_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer,
-                     group_count, group_size_buffer));
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM_BATCH:\n"
-                  << e.what() << std::endl
-                  << "OpenCL status: " << e.get_cl_code() << std::endl;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of TRSM_BATCH:\n" << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // TRSM_BATCH currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            onemkl::aligned_free(m);
-            onemkl::aligned_free(n);
-            onemkl::aligned_free(lda);
-            onemkl::aligned_free(ldb);
-            onemkl::aligned_free(trans);
-            onemkl::aligned_free(left_right);
-            onemkl::aligned_free(upper_lower);
-            onemkl::aligned_free(unit_diag);
-            onemkl::aligned_free(alpha);
-            onemkl::aligned_free(group_size);
-            onemkl::aligned_free(a_array);
-            onemkl::aligned_free(b_array);
-            onemkl::aligned_free(b_ref_array);
-            return true;
-        }
-#endif
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good;
-    {
-        auto B_accessor = B_buffer.template get_access<access::mode::read>();
-        good = check_equal_trsm_matrix(B_accessor, B_ref, total_size_b, 1, total_size_b, 10 * max,
-                                       std::cout);
-    }
-
-    onemkl::aligned_free(m);
-    onemkl::aligned_free(n);
-    onemkl::aligned_free(lda);
-    onemkl::aligned_free(ldb);
-    onemkl::aligned_free(trans);
-    onemkl::aligned_free(left_right);
-    onemkl::aligned_free(upper_lower);
-    onemkl::aligned_free(unit_diag);
-    onemkl::aligned_free(alpha);
-    onemkl::aligned_free(group_size);
-    onemkl::aligned_free(a_array);
-    onemkl::aligned_free(b_array);
-    onemkl::aligned_free(b_ref_array);
-
-    return good;
-}
-
-class TrsmBatchTests : public ::testing::TestWithParam<cl::sycl::device> {};
-
-TEST_P(TrsmBatchTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 5));
-}
-
-TEST_P(TrsmBatchTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 5));
-}
-
-TEST_P(TrsmBatchTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 5));
-}
-
-TEST_P(TrsmBatchTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmBatchTestSuite, TrsmBatchTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp
index 6b1bd5fe0..5b461d198 100644
--- a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp
+++ b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp
@@ -44,7 +44,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev) {
+int test(const device &dev) {
     // Prepare data.
     int64_t m, n;
     int64_t lda, ldb;
@@ -151,16 +151,13 @@ bool test(const device &dev) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped; // unsupported on this backend: report skip instead of pass/fail
+    }
+
     catch (const std::runtime_error &error) {
         std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n"
                   << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // TRSM_BATCH_STRIDE currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            return true;
-        }
-#endif
     }
 
     // Compare the results of reference implementation and DPC++ implementation.
@@ -171,25 +168,25 @@ bool test(const device &dev) {
                                        10 * std::max(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TrsmBatchStrideTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TrsmBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam()));
+    EXPECT_TRUEORSKIP(test<float>(GetParam()));
 }
 
 TEST_P(TrsmBatchStrideTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam()));
+    EXPECT_TRUEORSKIP(test<double>(GetParam()));
 }
 
 TEST_P(TrsmBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam()));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam()));
 }
 
 TEST_P(TrsmBatchStrideTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam()));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam()));
 }
 
 INSTANTIATE_TEST_SUITE_P(TrsmBatchStrideTestSuite, TrsmBatchStrideTests,
diff --git a/tests/unit_tests/blas/extensions/CMakeLists.txt b/tests/unit_tests/blas/extensions/CMakeLists.txt
index 1a248e5ec..6401a55ac 100644
--- a/tests/unit_tests/blas/extensions/CMakeLists.txt
+++ b/tests/unit_tests/blas/extensions/CMakeLists.txt
@@ -18,7 +18,7 @@
 #===============================================================================
 
 # Build object from all test sources
-set(EXTENSIONS_SOURCES "gemm_ext.cpp" "gemm_ext_off.cpp" "gemmt.cpp")
+set(EXTENSIONS_SOURCES "gemm_ext.cpp" "gemm_ext_off.cpp" "gemmt.cpp" "gemmt_usm.cpp")
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_extensions_rt OBJECT ${EXTENSIONS_SOURCES})
diff --git a/tests/unit_tests/blas/extensions/gemm_ext.cpp b/tests/unit_tests/blas/extensions/gemm_ext.cpp
index b951fed30..d1633431b 100644
--- a/tests/unit_tests/blas/extensions/gemm_ext.cpp
+++ b/tests/unit_tests/blas/extensions/gemm_ext.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename Ta, typename Tc>
-bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n,
-          int k, int lda, int ldb, int ldc, Tc alpha, Tc beta) {
+int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k,
+         int lda, int ldb, int ldc, Tc alpha, Tc beta) {
     // Prepare data.
     vector<Ta, allocator_helper<Ta, 64>> A, B;
     vector<Tc, allocator_helper<Tc, 64>> C, C_ref;
@@ -103,22 +103,19 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped; // unsupported on this backend: report skip instead of pass/fail
+    }
+
     catch (const std::runtime_error& error) {
         std::cout << "Error raised during execution of GEMM_EXT:\n" << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // GEMM_EXT currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            return true;
-        }
-#endif
     }
 
     // Compare the results of reference implementation and DPC++ implementation.
     auto C_accessor = C_buffer.template get_access<access::mode::read>();
     bool good       = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout);
 
-    return good;
+    return (int)good;
 }
 
 class GemmExtTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -126,62 +123,67 @@ class GemmExtTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GemmExtTests, HalfHalfFloatPrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<half, float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79,
                            83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<half, float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79,
                            83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<half, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79,
                            83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<half, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
-                                   79, 83, 91, 103, 105, 106, alpha, beta)));
+    EXPECT_TRUEORSKIP(
+        (test<half, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83,
+                           91, 103, 105, 106, alpha, beta)));
 }
 
 TEST_P(GemmExtTests, RealHalfPrecision) {
     half alpha(2.0);
     half beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<half, half>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79,
                           83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<half, half>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans,
-                                  79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<half, half>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans,
-                                  79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<half, half>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
-                                  79, 83, 91, 103, 105, 106, alpha, beta)));
+    EXPECT_TRUEORSKIP(
+        (test<half, half>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83,
+                          91, 103, 105, 106, alpha, beta)));
+    EXPECT_TRUEORSKIP(
+        (test<half, half>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83,
+                          91, 103, 105, 106, alpha, beta)));
+    EXPECT_TRUEORSKIP(
+        (test<half, half>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83,
+                          91, 103, 105, 106, alpha, beta)));
 }
 
 TEST_P(GemmExtTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<float, float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans,
                             79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<float, float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79,
                             83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<float, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79,
                             83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
-                                    79, 83, 91, 103, 105, 106, alpha, beta)));
+    EXPECT_TRUEORSKIP(
+        (test<float, float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83,
+                            91, 103, 105, 106, alpha, beta)));
 }
 
 TEST_P(GemmExtTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<double, double>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans,
                               79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<double, double>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79,
                               83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<double, double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79,
                               83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<double, double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79,
                               83, 91, 103, 105, 106, alpha, beta)));
 }
@@ -189,31 +191,31 @@ TEST_P(GemmExtTests, RealDoublePrecision) {
 TEST_P(GemmExtTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
         alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::trans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 79, 83, 91, 103,
         105, 106, alpha, beta)));
 }
@@ -221,31 +223,31 @@ TEST_P(GemmExtTests, ComplexSinglePrecision) {
 TEST_P(GemmExtTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
         alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::trans, 79, 83, 91, 103, 105,
         106, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(
+    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
         GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 79, 83, 91, 103,
         105, 106, alpha, beta)));
 }
diff --git a/tests/unit_tests/blas/extensions/gemm_ext_off.cpp b/tests/unit_tests/blas/extensions/gemm_ext_off.cpp
index 46c4ef477..9e28bc57f 100644
--- a/tests/unit_tests/blas/extensions/gemm_ext_off.cpp
+++ b/tests/unit_tests/blas/extensions/gemm_ext_off.cpp
@@ -44,9 +44,9 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename Ts, typename Ta, typename Tb, typename Tc>
-bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
-          onemkl::offset offsetc, int m, int n, int k, int lda, int ldb, int ldc, Ts alpha,
-          Ts beta) {
+int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
+         onemkl::offset offsetc, int m, int n, int k, int lda, int ldb, int ldc, Ts alpha,
+         Ts beta) {
     // Prepare data.
     vector<Ta, allocator_helper<Ta, 64>> A;
     vector<Tb, allocator_helper<Tb, 64>> B;
@@ -67,7 +67,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
 
     C_ref = C;
 
-    // Call Reference GEMM_EXT.
+    // Call Reference GEMM_EXT_OFF.
     const int m_ref = m, n_ref = n, k_ref = k;
     const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
 
@@ -81,7 +81,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
                (Ta_ref*)A.data(), &lda_ref, (Ta_ref*)&ao, (Tb_ref*)B.data(), &ldb_ref, (Tb_ref*)&bo,
                (Ts_ref*)&beta, (Tc_ref*)C_ref.data(), &ldc_ref, (Tc_ref*)co.data());
 
-    // Call DPC++ GEMM_EXT.
+    // Call DPC++ GEMM_EXT_OFF.
 
     // Catch asynchronous exceptions.
     auto exception_handler = [](exception_list exceptions) {
@@ -90,7 +90,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
                 std::rethrow_exception(e);
             }
             catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_EXT:\n"
+                std::cout << "Caught asynchronous SYCL exception during GEMM_EXT_OFF:\n"
                           << e.what() << std::endl
                           << "OpenCL status: " << e.get_cl_code() << std::endl;
             }
@@ -115,27 +115,25 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
 #endif
     }
     catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_EXT:\n"
+        std::cout << "Caught synchronous SYCL exception during GEMM_EXT_OFF:\n"
                   << e.what() << std::endl
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped; // unsupported on this backend: report skip instead of pass/fail
+    }
+
     catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMM_EXT:\n" << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // GEMM_EXT currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            return true;
-        }
-#endif
+        std::cout << "Error raised during execution of GEMM_EXT_OFF:\n"
+                  << error.what() << std::endl;
     }
 
     // Compare the results of reference implementation and DPC++ implementation.
     auto C_accessor = C_buffer.template get_access<access::mode::read>();
     bool good       = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout);
 
-    return good;
+    return (int)good;
 }
 
 class GemmExtOffTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -143,40 +141,40 @@ class GemmExtOffTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GemmExtOffTests, Int8Uint8Int32Precision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, onemkl::offset::fix,
         79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::fix, 79,
         83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::fix, 79,
         83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::fix, 79, 83,
         91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans,
         onemkl::offset::column, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::column,
         79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::column,
         79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::column, 79,
         83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, onemkl::offset::row,
         79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::row, 79,
         83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::row, 79,
         83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUE((test<float, int8_t, uint8_t, int32_t>(
+    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
         GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::row, 79, 83,
         91, 103, 105, 106, alpha, beta)));
 }
diff --git a/tests/unit_tests/blas/extensions/gemmt.cpp b/tests/unit_tests/blas/extensions/gemmt.cpp
index 6c2605a61..6f95bd33e 100644
--- a/tests/unit_tests/blas/extensions/gemmt.cpp
+++ b/tests/unit_tests/blas/extensions/gemmt.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     rand_matrix(A, transa, n, k, lda);
@@ -101,22 +101,19 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
     catch (const std::runtime_error& error) {
         std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl;
-#ifdef ENABLE_CUBLAS_BACKEND
-        // GEMMT currently not supported with CUBLAS backend.
-        std::string error_msg(error.what());
-        if (error_msg.compare("Not implemented for cublas") == 0) {
-            return true;
-        }
-#endif
     }
 
     // Compare the results of reference implementation and DPC++ implementation.
     auto C_accessor = C_buffer.template get_access<access::mode::read>();
     bool good = check_equal_matrix(C_accessor, C_ref, upper_lower, n, n, ldc, 10 * k, std::cout);
 
-    return good;
+    return (int)good;
 }
 
 class GemmtTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -124,100 +121,104 @@ class GemmtTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GemmtTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
 }
 
 TEST_P(GemmtTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
 }
 
 TEST_P(GemmtTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0);
     std::complex<float> beta(3.0);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::transpose::trans, 27,
-                                          98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::transpose::trans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::transpose::trans,
+                                                27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::transpose::trans, 27,
-                                          98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::transpose::trans,
-                                          27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::transpose::trans,
+                                                27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
 }
@@ -225,58 +226,58 @@ TEST_P(GemmtTests, ComplexSinglePrecision) {
 TEST_P(GemmtTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0);
     std::complex<double> beta(3.0);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::transpose::trans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::transpose::nontrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::transpose::trans, 27,
-                                           98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::nontrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::transpose::trans,
+                                                 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::transpose::conjtrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::transpose::trans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::transpose::trans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::transpose::nontrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::transpose::trans, 27,
-                                           98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::nontrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::transpose::trans,
+                                                 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::transpose::conjtrans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
         27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::transpose::trans,
-                                           27, 98, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
         27, 98, 101, 102, 103, alpha, beta));
 }
diff --git a/tests/unit_tests/blas/extensions/gemmt_usm.cpp b/tests/unit_tests/blas/extensions/gemmt_usm.cpp
new file mode 100644
index 000000000..b70aad7d1
--- /dev/null
+++ b/tests/unit_tests/blas/extensions/gemmt_usm.cpp
@@ -0,0 +1,289 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// Compares USM GEMMT on dev with the reference GEMMT; returns 1 (match), 0, or test_skipped.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
+    // Catch asynchronous exceptions: SYCL ones are logged to std::cout and swallowed.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during GEMMT:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: A, B and C are vectors backed by a shared-USM allocator.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    rand_matrix(A, transa, n, k, lda);
+    rand_matrix(B, transb, k, n, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc);
+
+    auto C_ref = C; // the reference routine updates this copy, leaving C for the DPC++ call
+
+    // Call Reference GEMMT (it takes dimensions by pointer, hence the const int copies).
+    const int n_ref = n, k_ref = k;
+    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::gemmt(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+            convert_to_cblas_trans(transb), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
+            &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ GEMMT.
+    // With CALL_RT_API the returned event is awaited; otherwise the whole queue is.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::gemmt(main_queue, upper_lower, transa, transb, n, k, alpha, A.data(),
+                                   lda, B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::gemmt,
+                    (main_queue, upper_lower, transa, transb, n, k, alpha, A.data(), lda, B.data(),
+                     ldb, beta, C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during GEMMT:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend ends the test early with test_skipped.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_matrix(C, C_ref, upper_lower, n, n, ldc, 10 * k, std::cout);
+
+    return (int)good;
+}
+// Fixture parameterized over a SYCL device; each test passes GetParam() on as the device.
+class GemmtUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// float: lower and upper, all four nontrans/trans pairings of transa/transb; n=27, k=98.
+TEST_P(GemmtUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+}
+// double: lower and upper, all four nontrans/trans pairings of transa/transb; n=27, k=98.
+TEST_P(GemmtUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+}
+// std::complex<float>: lower and upper, all nine nontrans/trans/conjtrans pairings; n=27, k=98.
+TEST_P(GemmtUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0);
+    std::complex<float> beta(3.0);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::transpose::trans,
+                                                27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::transpose::trans,
+                                                27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                  onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+}
+// std::complex<double>: lower and upper, all nine nontrans/trans/conjtrans pairings; n=27, k=98.
+TEST_P(GemmtUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0);
+    std::complex<double> beta(3.0);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::nontrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::transpose::trans,
+                                                 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::nontrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::transpose::trans,
+                                                 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27,
+        98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::nontrans,
+        27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                   onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans,
+        27, 98, 101, 102, 103, alpha, beta));
+}
+// Instantiates the suite for every entry of the global devices vector.
+INSTANTIATE_TEST_SUITE_P(GemmtUsmTestSuite, GemmtUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/include/test_common.hpp b/tests/unit_tests/blas/include/test_common.hpp
index badfbbb59..9c794f833 100644
--- a/tests/unit_tests/blas/include/test_common.hpp
+++ b/tests/unit_tests/blas/include/test_common.hpp
@@ -23,10 +23,19 @@
 #include <algorithm>
 
 #include <complex>
+#include <stdexcept>
 #include <type_traits>
 
 #include <CL/sycl.hpp>
 
+// Exceptions: the BLAS tests catch this type and report the run as test_skipped.
+namespace onemkl {
+class backend_unsupported_exception : public std::runtime_error {
+public:
+    backend_unsupported_exception() : std::runtime_error("Not yet supported for this backend") {}
+};
+} // namespace onemkl
+
 namespace std {
 static cl::sycl::half abs(cl::sycl::half v) {
     if (v < cl::sycl::half(0))
@@ -161,6 +170,13 @@ std::complex<double> rand_scalar(int mag) {
     return rand_complex_scalar<double>(mag);
 }
 
+// Fills n elements of v, spaced |inc| apart, with values drawn from rand_scalar<fp>().
+template <typename fp>
+void rand_vector(fp *v, int n, int inc) {
+    const int stride = std::abs(inc);
+    for (int k = 0; k < n; ++k) v[k * stride] = rand_scalar<fp>();
+}
+
 template <typename vec>
 void rand_vector(vec &v, int n, int inc) {
     using fp    = typename vec::value_type;
@@ -186,6 +202,13 @@ void print_matrix(vec &M, onemkl::transpose trans, int m, int n, int ld, char *n
     }
 }
 
+// Copies n elements spaced |inc| apart from src to the same offsets in dest.
+template <typename fp>
+void copy_vector(fp *src, int n, int inc, fp *dest) {
+    const int stride = std::abs(inc);
+    for (int k = 0; k < n; ++k) dest[k * stride] = src[k * stride];
+}
+
 template <typename vec_src, typename vec_dest>
 void copy_matrix(vec_src &src, onemkl::transpose trans, int m, int n, int ld, vec_dest &dest) {
     using T_data = typename vec_dest::value_type;
@@ -202,6 +225,20 @@ void copy_matrix(vec_src &src, onemkl::transpose trans, int m, int n, int ld, ve
     }
 }
 
+// Copies an m-by-n operand with leading dimension ld from src to the same offsets in dest.
+template <typename fp>
+void copy_matrix(fp *src, onemkl::transpose trans, int m, int n, int ld, fp *dest) {
+    // nontrans walks n runs of m elements; any other value walks m runs of n elements.
+    const bool plain = (trans == onemkl::transpose::nontrans);
+    const int runs = plain ? n : m, run_len = plain ? m : n;
+    for (int r = 0; r < runs; r++) {
+        for (int e = 0; e < run_len; e++) {
+            const int offset = e + r * ld;
+            dest[offset]     = static_cast<fp>(src[offset]);
+        }
+    }
+}
+
 template <typename vec>
 void rand_matrix(vec &M, onemkl::transpose trans, int m, int n, int ld) {
     using fp = typename vec::value_type;
diff --git a/tests/unit_tests/blas/level1/CMakeLists.txt b/tests/unit_tests/blas/level1/CMakeLists.txt
index f5213d0ef..08c977739 100644
--- a/tests/unit_tests/blas/level1/CMakeLists.txt
+++ b/tests/unit_tests/blas/level1/CMakeLists.txt
@@ -18,7 +18,7 @@
 #===============================================================================
 
 # Build object from all test sources
-set(L1_SOURCES "nrm2.cpp" "iamin.cpp" "iamax.cpp" "dotu.cpp" "dot.cpp" "dotc.cpp" "copy.cpp" "axpy.cpp" "asum.cpp" "swap.cpp" "sdsdot.cpp" "scal.cpp" "rotmg.cpp" "rotm.cpp" "rotg.cpp" "rot.cpp")
+set(L1_SOURCES "nrm2.cpp" "iamin.cpp" "iamax.cpp" "dotu.cpp" "dot.cpp" "dotc.cpp" "copy.cpp" "axpy.cpp" "asum.cpp" "swap.cpp" "sdsdot.cpp" "scal.cpp" "rotmg.cpp" "rotm.cpp" "rotg.cpp" "rot.cpp" "nrm2_usm.cpp" "iamin_usm.cpp" "iamax_usm.cpp" "dotu_usm.cpp" "dot_usm.cpp" "dotc_usm.cpp" "copy_usm.cpp" "axpy_usm.cpp" "asum_usm.cpp" "swap_usm.cpp" "sdsdot_usm.cpp" "scal_usm.cpp" "rotmg_usm.cpp" "rotm_usm.cpp" "rotg_usm.cpp" "rot_usm.cpp")
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_level1_rt OBJECT ${L1_SOURCES})
diff --git a/tests/unit_tests/blas/level1/asum.cpp b/tests/unit_tests/blas/level1/asum.cpp
index 2b192f79d..54d559064 100644
--- a/tests/unit_tests/blas/level1/asum.cpp
+++ b/tests/unit_tests/blas/level1/asum.cpp
@@ -23,7 +23,6 @@
 #include <limits>
 #include <vector>
 
-#include <gtest/gtest.h>
 #include <CL/sycl.hpp>
 #include "cblas.h"
 #include "onemkl/detail/config.hpp"
@@ -33,6 +32,8 @@
 #include "test_common.hpp"
 #include "test_helper.hpp"
 
+#include <gtest/gtest.h>
+
 using namespace cl::sycl;
 using std::vector;
 
@@ -41,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_res>
-bool test(const device& dev, int64_t N, int64_t incx) {
+int test(const device& dev, int64_t N, int64_t incx) {
     // Prepare data.
     vector<fp> x;
     fp_res result = fp_res(-1), result_ref = fp_res(-1);
@@ -87,6 +88,14 @@ bool test(const device& dev, int64_t N, int64_t incx) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -94,33 +103,33 @@ bool test(const device& dev, int64_t N, int64_t incx) {
         good                 = check_equal(result_accessor[0], result_ref, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class AsumTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(AsumTests, RealSinglePrecision) {
-    EXPECT_TRUE((::test<float, float>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((::test<float, float>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((::test<float, float>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, -3)));
 }
 
 TEST_P(AsumTests, RealDoublePrecision) {
-    EXPECT_TRUE((::test<double, double>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((::test<double, double>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((::test<double, double>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, -3)));
 }
 
 TEST_P(AsumTests, ComplexSinglePrecision) {
-    EXPECT_TRUE((::test<std::complex<float>, float>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((::test<std::complex<float>, float>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((::test<std::complex<float>, float>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, -3)));
 }
 
 TEST_P(AsumTests, ComplexDoublePrecision) {
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3)));
 }
 
 INSTANTIATE_TEST_SUITE_P(AsumTestSuite, AsumTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/asum_usm.cpp b/tests/unit_tests/blas/level1/asum_usm.cpp
new file mode 100644
index 000000000..5d3ec812d
--- /dev/null
+++ b/tests/unit_tests/blas/level1/asum_usm.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM ASUM over N elements of stride incx on dev and checks it against the reference ASUM.
+// Returns 1 on match, 0 on mismatch, or test_skipped when the backend does not support the call.
+template <typename fp, typename fp_res>
+int test(const device& dev, int64_t N, int64_t incx) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during ASUM:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua);
+    fp_res result_ref = fp_res(-1);
+    rand_vector(x, N, incx);
+
+    // Call Reference ASUM.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = std::abs(incx);
+    result_ref = ::asum<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
+
+    // Call DPC++ ASUM. The result is written to a shared USM allocation owned by this function.
+    auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt);
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::asum(main_queue, N, x.data(), incx, result_p, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::asum,
+                    (main_queue, N, x.data(), incx, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during ASUM:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        // Release the result before the early return so a skipped run does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal(*result_p, result_ref, N, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+
+    return (int)good;
+}
+// Fixture parameterized over a SYCL device; each test passes GetParam() on as the device.
+class AsumUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// float input, float result: N = 1357 with incx = 2, 1 and -3.
+TEST_P(AsumUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<float, float>(GetParam(), 1357, -3)));
+}
+// double input, double result: N = 1357 with incx = 2, 1 and -3.
+TEST_P(AsumUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<double, double>(GetParam(), 1357, -3)));
+}
+// std::complex<float> input, float result: N = 1357 with incx = 2, 1 and -3.
+TEST_P(AsumUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(GetParam(), 1357, -3)));
+}
+// std::complex<double> input, double result: N = 1357 with incx = 2, 1 and -3.
+TEST_P(AsumUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3)));
+}
+// Instantiates the suite for every entry of the global devices vector.
+INSTANTIATE_TEST_SUITE_P(AsumUsmTestSuite, AsumUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/axpy.cpp b/tests/unit_tests/blas/level1/axpy.cpp
index 950d6c16b..c5ec0a67a 100644
--- a/tests/unit_tests/blas/level1/axpy.cpp
+++ b/tests/unit_tests/blas/level1/axpy.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int N, int incx, int incy, fp alpha) {
+int test(const device &dev, int N, int incx, int incy, fp alpha) {
     // Prepare data.
     vector<fp> x, y, y_ref;
 
@@ -92,6 +92,14 @@ bool test(const device &dev, int N, int incx, int incy, fp alpha) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -99,34 +107,34 @@ bool test(const device &dev, int N, int incx, int incy, fp alpha) {
         good            = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class AxpyTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(AxpyTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -3, -2, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3, -2, alpha));
 }
 TEST_P(AxpyTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3, alpha));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1, alpha));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -3, -2, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3, -2, alpha));
 }
 TEST_P(AxpyTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2, 3, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1, 1, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3, -2, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2, alpha));
 }
 TEST_P(AxpyTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2, 3, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1, 1, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3, -2, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2, alpha));
 }
 
 INSTANTIATE_TEST_SUITE_P(AxpyTestSuite, AxpyTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/axpy_usm.cpp b/tests/unit_tests/blas/level1/axpy_usm.cpp
new file mode 100644
index 000000000..a67779dff
--- /dev/null
+++ b/tests/unit_tests/blas/level1/axpy_usm.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM onemkl::blas::axpy on `dev` and compares y with the reference ::axpy result.
+// N, incx, incy and alpha are forwarded unchanged to both implementations.
+// Returns test_skipped if onemkl::backend_unsupported_exception is thrown; otherwise the
+// check_equal_vector verdict converted to int.
+template <typename fp>
+int test(const device &dev, int N, int incx, int incy, fp alpha) {
+    // Print asynchronous SYCL errors delivered to the queue.
+    auto async_handler = [](exception_list errors) {
+        for (std::exception_ptr const &eptr : errors) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &err) {
+                std::cout << "Caught asynchronous SYCL exception during AXPY:\n"
+                          << err.what() << std::endl
+                          << "OpenCL status: " << err.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, async_handler);
+    context ctx = main_queue.get_context();
+
+    // Inputs use a shared-USM allocator; y_ref is a copy of y taken before either AXPY runs.
+    usm_allocator<fp, usm::alloc::shared, 64> usm_alloc(ctx, dev);
+    vector<fp, decltype(usm_alloc)> x(usm_alloc), y(usm_alloc);
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+    vector<fp, decltype(usm_alloc)> y_ref(y);
+
+    // Reference AXPY writes into y_ref.
+    using fp_ref = typename ref_type_info<fp>::type;
+    const int N_ref = N;
+    const int incx_ref = incx;
+    const int incy_ref = incy;
+    ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // DPC++ AXPY writes into y; CALL_RT_API picks the direct call, otherwise TEST_RUN_CT is used.
+    event axpy_done;
+    std::vector<event> deps;
+
+    try {
+#ifdef CALL_RT_API
+        axpy_done =
+            onemkl::blas::axpy(main_queue, N, alpha, x.data(), incx, y.data(), incy, deps);
+        axpy_done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::axpy,
+                    (main_queue, N, alpha, x.data(), incx, y.data(), incy, deps));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &err) {
+        std::cout << "Caught synchronous SYCL exception during AXPY:\n"
+                  << err.what() << std::endl
+                  << "OpenCL status: " << err.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        // Unsupported backend: report a skip instead of a comparison result.
+        return test_skipped;
+    }
+    catch (const std::runtime_error &err) {
+        std::cout << "Error raised during execution of AXPY:\n" << err.what() << std::endl;
+    }
+
+    // Errors that were only printed above still fall through to this comparison.
+    const bool matches = check_equal_vector(y, y_ref, N, incy, N, std::cout);
+    return static_cast<int>(matches);
+}
+
+class AxpyUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case calls test<T> with N = 1357 and (incx, incy) = (2, 3), (1, 1), (-3, -2).
+TEST_P(AxpyUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3, -2, alpha));
+}
+TEST_P(AxpyUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3, -2, alpha));
+}
+TEST_P(AxpyUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2, alpha));
+}
+TEST_P(AxpyUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2, alpha));
+}
+// Instantiated once per element of the extern `devices` vector.
+INSTANTIATE_TEST_SUITE_P(AxpyUsmTestSuite, AxpyUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/copy.cpp b/tests/unit_tests/blas/level1/copy.cpp
index 4c25d9c43..fdba79042 100644
--- a/tests/unit_tests/blas/level1/copy.cpp
+++ b/tests/unit_tests/blas/level1/copy.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, int N, int incx, int incy) {
+int test(const device& dev, int N, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y, y_ref;
 
@@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of COPY:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -98,30 +106,30 @@ bool test(const device& dev, int N, int incx, int incy) {
         good            = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class CopyTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(CopyTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3, -2));
 }
 TEST_P(CopyTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3, -2));
 }
 TEST_P(CopyTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
 }
 TEST_P(CopyTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
 }
 
 INSTANTIATE_TEST_SUITE_P(CopyTestSuite, CopyTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/copy_usm.cpp b/tests/unit_tests/blas/level1/copy_usm.cpp
new file mode 100644
index 000000000..5a528ebbb
--- /dev/null
+++ b/tests/unit_tests/blas/level1/copy_usm.cpp
@@ -0,0 +1,139 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM onemkl::blas::copy on `dev` and compares y with the reference ::copy result.
+// N, incx and incy are forwarded unchanged to both implementations.
+// Returns test_skipped if onemkl::backend_unsupported_exception is thrown; otherwise the
+// check_equal_vector verdict converted to int.
+template <typename fp>
+int test(const device& dev, int N, int incx, int incy) {
+    // Print asynchronous SYCL errors delivered to the queue.
+    auto async_handler = [](exception_list errors) {
+        for (std::exception_ptr const& eptr : errors) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& err) {
+                std::cout << "Caught asynchronous SYCL exception during COPY:\n"
+                          << err.what() << std::endl
+                          << "OpenCL status: " << err.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, async_handler);
+    context ctx = main_queue.get_context();
+
+    // Inputs use a shared-USM allocator; y_ref is a copy of y taken before either COPY runs.
+    usm_allocator<fp, usm::alloc::shared, 64> usm_alloc(ctx, dev);
+    vector<fp, decltype(usm_alloc)> x(usm_alloc), y(usm_alloc);
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+    vector<fp, decltype(usm_alloc)> y_ref(y);
+
+    // Reference COPY writes into y_ref.
+    using fp_ref = typename ref_type_info<fp>::type;
+    const int N_ref = N;
+    const int incx_ref = incx;
+    const int incy_ref = incy;
+    ::copy(&N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
+
+    // DPC++ COPY writes into y; CALL_RT_API picks the direct call, otherwise TEST_RUN_CT is used.
+    event copy_done;
+    std::vector<event> deps;
+
+    try {
+#ifdef CALL_RT_API
+        copy_done = onemkl::blas::copy(main_queue, N, x.data(), incx, y.data(), incy, deps);
+        copy_done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::copy,
+                    (main_queue, N, x.data(), incx, y.data(), incy, deps));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& err) {
+        std::cout << "Caught synchronous SYCL exception during COPY:\n"
+                  << err.what() << std::endl
+                  << "OpenCL status: " << err.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        // Unsupported backend: report a skip instead of a comparison result.
+        return test_skipped;
+    }
+    catch (const std::runtime_error& err) {
+        std::cout << "Error raised during execution of COPY:\n" << err.what() << std::endl;
+    }
+
+    // Errors that were only printed above still fall through to this comparison.
+    const bool matches = check_equal_vector(y, y_ref, N, incy, N, std::cout);
+    return static_cast<int>(matches);
+}
+
+class CopyUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case calls test<T> with N = 1357 and (incx, incy) = (2, 3), (1, 1), (-3, -2).
+TEST_P(CopyUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3, -2));
+}
+TEST_P(CopyUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3, -2));
+}
+TEST_P(CopyUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+}
+TEST_P(CopyUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+}
+// Instantiated once per element of the extern `devices` vector.
+INSTANTIATE_TEST_SUITE_P(CopyUsmTestSuite, CopyUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dot.cpp b/tests/unit_tests/blas/level1/dot.cpp
index fbf65a482..3603559b5 100644
--- a/tests/unit_tests/blas/level1/dot.cpp
+++ b/tests/unit_tests/blas/level1/dot.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_res>
-bool test(const device& dev, int N, int incx, int incy) {
+int test(const device& dev, int N, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y;
     fp_res result = fp_res(-1), result_ref = fp_res(-1);
@@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -98,25 +106,25 @@ bool test(const device& dev, int N, int incx, int incy) {
         good                 = check_equal(result_accessor[0], result_ref, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class DotTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(DotTests, RealSinglePrecision) {
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 2, 3)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 1, 1)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, -3, -2)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, 3)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1, 1)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3, -2)));
 }
 TEST_P(DotTests, RealDoublePrecision) {
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 2, 3)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 1, 1)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, -3, -2)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, 3)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1, 1)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3, -2)));
 }
 //TEST_P(DotTests, RealDoubleSinglePrecision) {
-//    EXPECT_TRUE((test<float, double>(GetParam(), 1357, 2, 3)));
-//    EXPECT_TRUE((test<float, double>(GetParam(), 1357, 1, 1)));
-//    EXPECT_TRUE((test<float, double>(GetParam(), 1357, -3, -2)));
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, 2, 3)));
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, 1, 1)));
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, -3, -2)));
 //}
 
 INSTANTIATE_TEST_SUITE_P(DotTestSuite, DotTests, ::testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/blas/level1/dot_usm.cpp b/tests/unit_tests/blas/level1/dot_usm.cpp
new file mode 100644
index 000000000..e5246df6f
--- /dev/null
+++ b/tests/unit_tests/blas/level1/dot_usm.cpp
@@ -0,0 +1,136 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM onemkl::blas::dot on `dev` and compares it with the reference ::dot result.
+// N, incx and incy are forwarded unchanged to both implementations.
+// Returns test_skipped if onemkl::backend_unsupported_exception is thrown; otherwise the
+// check_equal verdict converted to int. result_p is freed on the skip path and the normal path.
+template <typename fp, typename fp_res>
+int test(const device& dev, int N, int incx, int incy) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during DOT:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua);
+    fp_res result_ref = fp_res(-1);
+
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+
+    // Call Reference DOT.
+    const int N_ref = N, incx_ref = incx, incy_ref = incy;
+    result_ref = ::dot<fp, fp_res>(&N_ref, (fp*)x.data(), &incx_ref, (fp*)y.data(), &incy_ref);
+
+    // Call DPC++ DOT; the result is written through result_p, a shared-USM allocation.
+    auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt);
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::dot(main_queue, N, x.data(), incx, y.data(), incy, result_p,
+                                 dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::dot,
+                    (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during DOT:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception& e) {
+        // Free the result buffer here too, otherwise every skipped run leaks it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal(*result_p, result_ref, N, std::cout);
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+class DotUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case calls test<fp, fp_res> with N = 1357 and (incx, incy) = (2, 3), (1, 1), (-3, -2).
+TEST_P(DotUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, 3)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1, 1)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3, -2)));
+}
+TEST_P(DotUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, 3)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1, 1)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3, -2)));
+}
+//TEST_P(DotUsmTests, RealDoubleSinglePrecision) {
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, 2, 3)));
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, 1, 1)));
+//    EXPECT_TRUEORSKIP((test<float, double>(GetParam(), 1357, -3, -2)));
+//}
+// Instantiated once per element of the extern `devices` vector.
+INSTANTIATE_TEST_SUITE_P(DotUsmTestSuite, DotUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotc.cpp b/tests/unit_tests/blas/level1/dotc.cpp
index c3cd2d88d..5cb69d1e2 100644
--- a/tests/unit_tests/blas/level1/dotc.cpp
+++ b/tests/unit_tests/blas/level1/dotc.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int N, int incx, int incy) {
+int test(const device &dev, int N, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y;
     fp result = 0.0, result_reference = 0.0;
@@ -93,6 +93,14 @@ bool test(const device &dev, int N, int incx, int incy) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -100,20 +108,20 @@ bool test(const device &dev, int N, int incx, int incy) {
         good                 = check_equal(result_accessor[0], result_reference, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class DotcTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(DotcTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
 }
 TEST_P(DotcTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
 }
 
 INSTANTIATE_TEST_SUITE_P(DotcTestSuite, DotcTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/dotc_usm.cpp b/tests/unit_tests/blas/level1/dotc_usm.cpp
new file mode 100644
index 000000000..155565a1d
--- /dev/null
+++ b/tests/unit_tests/blas/level1/dotc_usm.cpp
@@ -0,0 +1,134 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM onemkl::blas::dotc on `dev` and compares it with the reference ::dotc result.
+// N, incx and incy are forwarded unchanged to both implementations.
+// Returns test_skipped if onemkl::backend_unsupported_exception is thrown; otherwise the
+// check_equal verdict converted to int. result_p is freed on the skip path and the normal path.
+template <typename fp>
+int test(const device &dev, int N, int incx, int incy) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during DOTC:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua);
+    fp result_reference = 0.0;
+
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+
+    // Call Reference DOTC.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = incx, incy_ref = incy;
+    ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
+           &incy_ref);
+
+    // Call DPC++ DOTC; the result is written through result_p, a shared-USM allocation.
+    auto result_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::dotc(main_queue, N, x.data(), incx, y.data(), incy, result_p,
+                                  dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::dotc,
+                    (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during DOTC:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        // Free the result buffer here too, otherwise every skipped run leaks it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal(*result_p, result_reference, N, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+class DotcUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case calls test<T> with N = 1357 and (incx, incy) = (2, 3), (1, 1), (-3, -2).
+TEST_P(DotcUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+}
+TEST_P(DotcUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+}
+// Instantiated once per element of the extern `devices` vector.
+INSTANTIATE_TEST_SUITE_P(DotcUsmTestSuite, DotcUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotu.cpp b/tests/unit_tests/blas/level1/dotu.cpp
index f0aa503a8..f6237746c 100644
--- a/tests/unit_tests/blas/level1/dotu.cpp
+++ b/tests/unit_tests/blas/level1/dotu.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int N, int incx, int incy) {
+int test(const device &dev, int N, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y;
     fp result = 0.0, result_reference = 0.0;
@@ -93,6 +93,14 @@ bool test(const device &dev, int N, int incx, int incy) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -100,20 +108,20 @@ bool test(const device &dev, int N, int incx, int incy) {
         good                 = check_equal(result_accessor[0], result_reference, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class DotuTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(DotuTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
 }
 TEST_P(DotuTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
 }
 
 INSTANTIATE_TEST_SUITE_P(DotuTestSuite, DotuTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/dotu_usm.cpp b/tests/unit_tests/blas/level1/dotu_usm.cpp
new file mode 100644
index 000000000..17658c56e
--- /dev/null
+++ b/tests/unit_tests/blas/level1/dotu_usm.cpp
@@ -0,0 +1,148 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM (pointer-based) DOTU entry point on `dev` for N-element vectors
+// with strides incx/incy and checks the result against the reference ::dotu.
+// Returns test_skipped when onemkl::backend_unsupported_exception is thrown,
+// otherwise the result of check_equal converted to int.
+template <typename fp>
+int test(const device &dev, int N, int incx, int incy) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during DOTU:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua);
+    fp result_reference = 0.0;
+
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+
+    // Call Reference DOTU.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = incx, incy_ref = incy;
+
+    ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
+           &incy_ref);
+
+    // Call DPC++ DOTU.
+
+    auto result_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    if (result_p == nullptr) {
+        // malloc_shared is defined outside this file; guard against it signalling
+        // failure with a null pointer rather than dereferencing the pointer below.
+        std::cout << "Failed to allocate shared USM memory for the DOTU result"
+                  << std::endl;
+        return (int)false;
+    }
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::dotu(main_queue, N, x.data(), incx, y.data(), incy, result_p,
+                                  dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::dotu,
+                    (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during DOTU:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        // Release the result slot before leaving early so the skip path does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal(*result_p, result_reference, N, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+// Fixture parameterized over the SYCL device each DOTU USM test runs on.
+class DotuUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Each case exercises positive, unit and negative strides.
+TEST_P(DotuUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3, -2));
+}
+TEST_P(DotuUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3, -2));
+}
+
+INSTANTIATE_TEST_SUITE_P(DotuUsmTestSuite, DotuUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamax.cpp b/tests/unit_tests/blas/level1/iamax.cpp
index de85b9bd7..949322243 100644
--- a/tests/unit_tests/blas/level1/iamax.cpp
+++ b/tests/unit_tests/blas/level1/iamax.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, int N, int incx) {
+int test(const device& dev, int N, int incx) {
     // Prepare data.
     vector<fp> x;
     int64_t result = -1, result_ref = -1;
@@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) {
         good                 = check_equal(result_accessor[0], result_ref, 0, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class IamaxTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(IamaxTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3));
 }
 TEST_P(IamaxTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3));
 }
 TEST_P(IamaxTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3));
 }
 TEST_P(IamaxTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3));
 }
 
 INSTANTIATE_TEST_SUITE_P(IamaxTestSuite, IamaxTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/iamax_usm.cpp b/tests/unit_tests/blas/level1/iamax_usm.cpp
new file mode 100644
index 000000000..08b3d4fd5
--- /dev/null
+++ b/tests/unit_tests/blas/level1/iamax_usm.cpp
@@ -0,0 +1,155 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <complex>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM (pointer-based) IAMAX entry point on `dev` for an N-element vector
+// with stride incx and checks the returned index against the reference ::iamax.
+// Returns test_skipped when onemkl::backend_unsupported_exception is thrown,
+// otherwise the result of check_equal converted to int.
+template <typename fp>
+int test(const device& dev, int N, int incx) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during IAMAX:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua);
+    int64_t result_ref = -1;
+    rand_vector(x, N, incx);
+
+    // Call Reference IAMAX.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = incx;
+
+    result_ref = ::iamax(&N_ref, (fp_ref*)x.data(), &incx_ref);
+
+    // Call DPC++ IAMAX.
+
+    auto result_p = (int64_t*)onemkl::malloc_shared(64, sizeof(int64_t), dev, cxt);
+    if (result_p == nullptr) {
+        // malloc_shared is defined outside this file; guard against it signalling
+        // failure with a null pointer rather than dereferencing the pointer below.
+        std::cout << "Failed to allocate shared USM memory for the IAMAX result"
+                  << std::endl;
+        return (int)false;
+    }
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::iamax(main_queue, N, x.data(), incx, result_p, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::iamax,
+                    (main_queue, N, x.data(), incx, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during IAMAX:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        // Release the result slot before leaving early so the skip path does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal(*result_p, result_ref, 0, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+// Fixture parameterized over the SYCL device each IAMAX USM test runs on.
+class IamaxUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Each case exercises a positive, a unit and a negative stride.
+TEST_P(IamaxUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3));
+}
+TEST_P(IamaxUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3));
+}
+TEST_P(IamaxUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3));
+}
+TEST_P(IamaxUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3));
+}
+
+INSTANTIATE_TEST_SUITE_P(IamaxUsmTestSuite, IamaxUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamin.cpp b/tests/unit_tests/blas/level1/iamin.cpp
index 57aa1bb89..1345b3dfa 100644
--- a/tests/unit_tests/blas/level1/iamin.cpp
+++ b/tests/unit_tests/blas/level1/iamin.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, int N, int incx) {
+int test(const device& dev, int N, int incx) {
     // Prepare data.
     vector<fp> x;
     int64_t result = -1, result_ref = -1;
@@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) {
         good                 = check_equal(result_accessor[0], result_ref, 0, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class IaminTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(IaminTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3));
 }
 TEST_P(IaminTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3));
 }
 TEST_P(IaminTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3));
 }
 TEST_P(IaminTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3));
 }
 
 INSTANTIATE_TEST_SUITE_P(IaminTestSuite, IaminTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/iamin_usm.cpp b/tests/unit_tests/blas/level1/iamin_usm.cpp
new file mode 100644
index 000000000..617efd6b6
--- /dev/null
+++ b/tests/unit_tests/blas/level1/iamin_usm.cpp
@@ -0,0 +1,155 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <complex>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM (pointer-based) IAMIN entry point on `dev` for an N-element vector
+// with stride incx and checks the returned index against the reference ::iamin.
+// Returns test_skipped when onemkl::backend_unsupported_exception is thrown,
+// otherwise the result of check_equal converted to int.
+template <typename fp>
+int test(const device& dev, int N, int incx) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during IAMIN:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua);
+    int64_t result_ref = -1;
+    rand_vector(x, N, incx);
+
+    // Call Reference IAMIN.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = incx;
+
+    result_ref = ::iamin(&N_ref, (fp_ref*)x.data(), &incx_ref);
+
+    // Call DPC++ IAMIN.
+
+    auto result_p = (int64_t*)onemkl::malloc_shared(64, sizeof(int64_t), dev, cxt);
+    if (result_p == nullptr) {
+        // malloc_shared is defined outside this file; guard against it signalling
+        // failure with a null pointer rather than dereferencing the pointer below.
+        std::cout << "Failed to allocate shared USM memory for the IAMIN result"
+                  << std::endl;
+        return (int)false;
+    }
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::iamin(main_queue, N, x.data(), incx, result_p, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::iamin,
+                    (main_queue, N, x.data(), incx, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during IAMIN:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        // Release the result slot before leaving early so the skip path does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal(*result_p, result_ref, 0, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+// Fixture parameterized over the SYCL device each IAMIN USM test runs on.
+class IaminUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Each case exercises a positive, a unit and a negative stride.
+TEST_P(IaminUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -3));
+}
+TEST_P(IaminUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -3));
+}
+TEST_P(IaminUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -3));
+}
+TEST_P(IaminUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -3));
+}
+
+INSTANTIATE_TEST_SUITE_P(IaminUsmTestSuite, IaminUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/nrm2.cpp b/tests/unit_tests/blas/level1/nrm2.cpp
index ed2099b80..5e2ee3967 100644
--- a/tests/unit_tests/blas/level1/nrm2.cpp
+++ b/tests/unit_tests/blas/level1/nrm2.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_res>
-bool test(const device& dev, int N, int incx) {
+int test(const device& dev, int N, int incx) {
     // Prepare data.
     vector<fp> x;
     fp_res result = fp_res(-1), result_ref = fp_res(-1);
@@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) {
         good                 = check_equal(result_accessor[0], result_ref, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Nrm2Tests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(Nrm2Tests, RealSinglePrecision) {
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3)));
 }
 TEST_P(Nrm2Tests, RealDoublePrecision) {
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3)));
 }
 TEST_P(Nrm2Tests, ComplexSinglePrecision) {
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -3)));
 }
 TEST_P(Nrm2Tests, ComplexDoublePrecision) {
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 2)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 1)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, -3)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3)));
 }
 
 INSTANTIATE_TEST_SUITE_P(Nrm2TestSuite, Nrm2Tests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/nrm2_usm.cpp b/tests/unit_tests/blas/level1/nrm2_usm.cpp
new file mode 100644
index 000000000..4b82162ee
--- /dev/null
+++ b/tests/unit_tests/blas/level1/nrm2_usm.cpp
@@ -0,0 +1,156 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <complex>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM (pointer-based) NRM2 entry point on `dev` for an N-element vector
+// with stride incx and checks the result against the reference ::nrm2 (which is
+// called with |incx|). Returns test_skipped when
+// onemkl::backend_unsupported_exception is thrown, otherwise check_equal's result as int.
+template <typename fp, typename fp_res>
+int test(const device& dev, int N, int incx) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during NRM2:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua);
+    fp_res result_ref = fp_res(-1);
+
+    rand_vector(x, N, incx);
+
+    // Call Reference NRM2.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = std::abs(incx);
+
+    result_ref = ::nrm2<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
+
+    // Call DPC++ NRM2.
+
+    auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt);
+    if (result_p == nullptr) {
+        // malloc_shared is defined outside this file; guard against it signalling
+        // failure with a null pointer rather than dereferencing the pointer below.
+        std::cout << "Failed to allocate shared USM memory for the NRM2 result"
+                  << std::endl;
+        return (int)false;
+    }
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::nrm2(main_queue, N, x.data(), incx, result_p, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::nrm2,
+                    (main_queue, N, x.data(), incx, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during NRM2:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        // Release the result slot before leaving early so the skip path does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal(*result_p, result_ref, N, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+
+// Fixture parameterized over the SYCL device each NRM2 USM test runs on.
+class Nrm2UsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Each case exercises a positive, a unit and a negative stride.
+TEST_P(Nrm2UsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3)));
+}
+TEST_P(Nrm2UsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3)));
+}
+TEST_P(Nrm2UsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -3)));
+}
+TEST_P(Nrm2UsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3)));
+}
+
+INSTANTIATE_TEST_SUITE_P(Nrm2UsmTestSuite, Nrm2UsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rot.cpp b/tests/unit_tests/blas/level1/rot.cpp
index b74229302..0a3c8c603 100644
--- a/tests/unit_tests/blas/level1/rot.cpp
+++ b/tests/unit_tests/blas/level1/rot.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) {
+int test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) {
     // Prepare data.
     vector<fp> x, x_ref, y, y_ref;
     rand_vector(x, N, incx);
@@ -92,6 +92,14 @@ bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -102,7 +110,7 @@ bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s
         good            = good_x && good_y;
     }
 
-    return good;
+    return (int)good;
 }
 
 class RotTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -110,30 +118,30 @@ class RotTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(RotTests, RealSinglePrecision) {
     float c(2.0);
     float s(-0.5);
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 2, 3, c, s)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 1, 1, c, s)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, -2, -3, c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -2, -3, c, s)));
 }
 TEST_P(RotTests, RealDoublePrecision) {
     double c(2.0);
     double s(-0.5);
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 2, 3, c, s)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 1, 1, c, s)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, -2, -3, c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -2, -3, c, s)));
 }
 TEST_P(RotTests, ComplexSinglePrecision) {
     float c = 2.0;
     float s = -0.5;
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, 2, 3, c, s)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, 1, 1, c, s)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, -2, -3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -2, -3, c, s)));
 }
 TEST_P(RotTests, ComplexDoublePrecision) {
     double c = 2.0;
     double s = -0.5;
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 2, 3, c, s)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 1, 1, c, s)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, -2, -3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -2, -3, c, s)));
 }
 
 INSTANTIATE_TEST_SUITE_P(RotTestSuite, RotTests, ::testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/blas/level1/rot_usm.cpp b/tests/unit_tests/blas/level1/rot_usm.cpp
new file mode 100644
index 000000000..fe1dd83a9
--- /dev/null
+++ b/tests/unit_tests/blas/level1/rot_usm.cpp
@@ -0,0 +1,150 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM ROT on `dev` and checks x and y against the reference ::rot result.
+// fp is the element type of x and y; fp_scalar is the type of the parameters c and s.
+// Returns 1 on match, 0 on mismatch, or test_skipped if the backend is unsupported.
+template <typename fp, typename fp_scalar>
+int test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) {
+    // Print any asynchronous SYCL exceptions handed to the queue's handler.
+    auto async_handler = [](exception_list pending) {
+        for (std::exception_ptr const &captured : pending) {
+            try {
+                std::rethrow_exception(captured);
+            }
+            catch (exception const &sycl_err) {
+                std::cout << "Caught asynchronous SYCL exception during ROT:\n"
+                          << sycl_err.what() << std::endl
+                          << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, async_handler);
+    context cxt(main_queue.get_context());
+    event done;
+    std::vector<event> dependencies;
+
+    // Inputs use a shared-USM allocator; the reference works on copies of x and y.
+    usm_allocator<fp, usm::alloc::shared, 64> shared_alloc(cxt, dev);
+    vector<fp, decltype(shared_alloc)> x(shared_alloc);
+    vector<fp, decltype(shared_alloc)> y(shared_alloc);
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+    auto x_ref(x);
+    auto y_ref(y);
+
+    // Reference ROT; sizes and strides are passed by address.
+    using fp_ref = typename ref_type_info<fp>::type;
+    const int N_ref    = N;
+    const int incx_ref = incx;
+    const int incy_ref = incy;
+    ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
+          (fp_scalar *)&c, (fp_scalar *)&s);
+
+    // DPC++ ROT on the USM pointers; wait for completion before comparing.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::rot(main_queue, N, x.data(), incx, y.data(), incy, c, s, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::rot,
+                    (main_queue, N, x.data(), incx, y.data(), incy, c, s, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &sycl_err) {
+        std::cout << "Caught synchronous SYCL exception during ROT:\n"
+                  << sycl_err.what() << std::endl
+                  << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        // Nothing to compare when the backend reports the routine as unsupported.
+        return test_skipped;
+    }
+    catch (const std::runtime_error &failure) {
+        std::cout << "Error raised during execution of ROT:\n" << failure.what() << std::endl;
+    }
+
+    // Errors that were only logged above fall through to this comparison.
+    // Both vectors are always checked (no short-circuit between the two calls).
+    const bool x_matches = check_equal_vector(x, x_ref, N, incx, N, std::cout);
+    const bool y_matches = check_equal_vector(y, y_ref, N, incy, N, std::cout);
+    return (x_matches && y_matches) ? 1 : 0;
+}
+
+class RotUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case runs test() with N = 1357, c = 2.0, s = -0.5 and strides (2, 3), (1, 1), (-2, -3).
+TEST_P(RotUsmTests, RealSinglePrecision) {
+    float c(2.0);
+    float s(-0.5);
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -2, -3, c, s)));
+}
+TEST_P(RotUsmTests, RealDoublePrecision) {
+    double c(2.0);
+    double s(-0.5);
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -2, -3, c, s)));
+}
+TEST_P(RotUsmTests, ComplexSinglePrecision) {
+    float c = 2.0;
+    float s = -0.5;
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -2, -3, c, s)));
+}
+TEST_P(RotUsmTests, ComplexDoublePrecision) {
+    double c = 2.0;
+    double s = -0.5;
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2, 3, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 1, 1, c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -2, -3, c, s)));
+}
+// Instantiated once per entry in the externally defined `devices` vector.
+INSTANTIATE_TEST_SUITE_P(RotUsmTestSuite, RotUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotg.cpp b/tests/unit_tests/blas/level1/rotg.cpp
index b71695157..f88d08fc4 100644
--- a/tests/unit_tests/blas/level1/rotg.cpp
+++ b/tests/unit_tests/blas/level1/rotg.cpp
@@ -42,15 +42,20 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device &dev, fp s, fp_scalar c) {
+int test(const device &dev) {
     // Prepare data.
-    fp a, b, a_ref, b_ref, s_ref;
-    fp_scalar c_ref;
+    fp a, b, s, a_ref, b_ref, s_ref;
+    fp_scalar c, c_ref;
+
+    a = rand_scalar<fp>();
+    b = rand_scalar<fp>();
+    s = rand_scalar<fp>();
+    c = rand_scalar<fp_scalar>();
 
-    a     = rand_scalar<fp>();
-    b     = rand_scalar<fp>();
     a_ref = a;
     b_ref = b;
+    s_ref = s;
+    c_ref = c;
 
     // Call Reference ROTG.
     using fp_ref = typename ref_type_info<fp>::type;
@@ -94,6 +99,14 @@ bool test(const device &dev, fp s, fp_scalar c) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -109,38 +122,30 @@ bool test(const device &dev, fp s, fp_scalar c) {
         good = good_a && good_b && good_c && good_s;
     }
 
-    return good;
+    return (int)good;
 }
 
 class RotgTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(RotgTests, RealSinglePrecision) {
-    float c(2.0);
-    float s(-0.5);
-    EXPECT_TRUE((test<float, float>(GetParam(), c, s)));
-    EXPECT_TRUE((test<float, float>(GetParam(), c, s)));
-    EXPECT_TRUE((test<float, float>(GetParam(), c, s)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
 }
 TEST_P(RotgTests, RealDoublePrecision) {
-    double c(2.0);
-    double s(-0.5);
-    EXPECT_TRUE((test<double, double>(GetParam(), c, s)));
-    EXPECT_TRUE((test<double, double>(GetParam(), c, s)));
-    EXPECT_TRUE((test<double, double>(GetParam(), c, s)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
 }
 TEST_P(RotgTests, ComplexSinglePrecision) {
-    float c = 2.0;
-    float s = -0.5;
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), c, s)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), c, s)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
 }
 TEST_P(RotgTests, ComplexDoublePrecision) {
-    double c = 2.0;
-    double s = -0.5;
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), c, s)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), c, s)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), c, s)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
 }
 
 INSTANTIATE_TEST_SUITE_P(RotgTestSuite, RotgTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/rotg_usm.cpp b/tests/unit_tests/blas/level1/rotg_usm.cpp
new file mode 100644
index 000000000..aa399c77a
--- /dev/null
+++ b/tests/unit_tests/blas/level1/rotg_usm.cpp
@@ -0,0 +1,158 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// USM ROTG check against reference ::rotg on random scalars: returns 1 on match, 0 on
+// mismatch, or test_skipped when onemkl::backend_unsupported_exception is thrown.
+template <typename fp, typename fp_scalar>
+int test(const device &dev) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during ROTG:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data. Every *_ref value starts from the same input as its DPC++ counterpart.
+    fp a, b, s, a_ref, b_ref, s_ref;
+    fp_scalar c, c_ref;
+    a     = rand_scalar<fp>();
+    b     = rand_scalar<fp>();
+    s     = rand_scalar<fp>();
+    c     = rand_scalar<fp_scalar>();
+    a_ref = a;
+    b_ref = b;
+    s_ref = s;
+    c_ref = c;
+
+    // Call Reference ROTG.
+    using fp_ref = typename ref_type_info<fp>::type;
+    ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref);
+
+    // Call DPC++ ROTG on one-element shared USM buffers; c is sized by its own type.
+    fp *a_p        = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    fp *b_p        = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    fp *s_p        = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    fp_scalar *c_p = (fp_scalar *)onemkl::malloc_shared(64, sizeof(fp_scalar), dev, cxt);
+    // Single release path so the early skip return does not leak the allocations.
+    auto release_usm = [&]() {
+        onemkl::free_shared(a_p, cxt);
+        onemkl::free_shared(b_p, cxt);
+        onemkl::free_shared(s_p, cxt);
+        onemkl::free_shared(c_p, cxt);
+    };
+
+    a_p[0] = a;
+    b_p[0] = b;
+    s_p[0] = s;
+    c_p[0] = c;
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::rotg(main_queue, a_p, b_p, c_p, s_p, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::rotg, (main_queue, a_p, b_p, c_p, s_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during ROTG:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        release_usm();
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good_a = check_equal(a_p[0], a_ref, 4, std::cout);
+    bool good_b = check_equal(b_p[0], b_ref, 4, std::cout);
+    bool good_s = check_equal(s_p[0], s_ref, 4, std::cout);
+    bool good_c = check_equal(c_p[0], c_ref, 4, std::cout);
+    bool good   = good_a && good_b && good_c && good_s;
+    release_usm();
+    return (int)good;
+}
+
+class RotgUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case calls test() three times on the parameter device; inputs come from rand_scalar.
+TEST_P(RotgUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam())));
+}
+TEST_P(RotgUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam())));
+}
+TEST_P(RotgUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam())));
+}
+TEST_P(RotgUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam())));
+}
+// Instantiated once per entry in the externally defined `devices` vector.
+INSTANTIATE_TEST_SUITE_P(RotgUsmTestSuite, RotgUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotm.cpp b/tests/unit_tests/blas/level1/rotm.cpp
index 50af06d66..8c33e6333 100644
--- a/tests/unit_tests/blas/level1/rotm.cpp
+++ b/tests/unit_tests/blas/level1/rotm.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int N, int incx, int incy, fp flag) {
+int test(const device &dev, int N, int incx, int incy, fp flag) {
     // Prepare data.
     vector<fp> x, x_ref, y, y_ref;
     vector<fp> param;
@@ -96,6 +96,14 @@ bool test(const device &dev, int N, int incx, int incy, fp flag) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -106,46 +114,46 @@ bool test(const device &dev, int N, int incx, int incy, fp flag) {
         good            = good_x && good_y;
     }
 
-    return good;
+    return (int)good;
 }
 
 class RotmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(RotmTests, RealSinglePrecision) {
     float flag(-1.0);
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
     flag = 0.0;
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
     flag = 1.0;
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
     flag = -2.0;
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
 }
 TEST_P(RotmTests, RealDoublePrecision) {
     double flag(-1.0);
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
     flag = 0.0;
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
     flag = 1.0;
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
     flag = -2.0;
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -2, -3, flag));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
 }
 
 INSTANTIATE_TEST_SUITE_P(RotmTestSuite, RotmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/rotm_usm.cpp b/tests/unit_tests/blas/level1/rotm_usm.cpp
new file mode 100644
index 000000000..0c438c383
--- /dev/null
+++ b/tests/unit_tests/blas/level1/rotm_usm.cpp
@@ -0,0 +1,161 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM ROTM on `dev` and checks x and y against the reference ::rotm result.
+// param holds 5 values; param[0] is overwritten with `flag` before both calls.
+// Returns 1 on match, 0 on mismatch, or test_skipped if the backend is unsupported.
+template <typename fp>
+int test(const device &dev, int N, int incx, int incy, fp flag) {
+    // Print any asynchronous SYCL exceptions handed to the queue's handler.
+    auto async_handler = [](exception_list pending) {
+        for (std::exception_ptr const &captured : pending) {
+            try {
+                std::rethrow_exception(captured);
+            }
+            catch (exception const &sycl_err) {
+                std::cout << "Caught asynchronous SYCL exception during ROTM:\n"
+                          << sycl_err.what() << std::endl
+                          << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, async_handler);
+    context cxt(main_queue.get_context());
+    event done;
+    std::vector<event> dependencies;
+
+    // Inputs use a shared-USM allocator; the reference works on copies of x and y.
+    usm_allocator<fp, usm::alloc::shared, 64> shared_alloc(cxt, dev);
+    vector<fp, decltype(shared_alloc)> x(shared_alloc);
+    vector<fp, decltype(shared_alloc)> y(shared_alloc);
+    vector<fp, decltype(shared_alloc)> param(shared_alloc);
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+    rand_vector(param, 5, 1);
+    param[0] = flag;
+    auto x_ref(x);
+    auto y_ref(y);
+
+    // Reference ROTM; sizes and strides are passed by address.
+    using fp_ref = typename ref_type_info<fp>::type;
+    const int N_ref    = N;
+    const int incx_ref = incx;
+    const int incy_ref = incy;
+    ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
+           (fp_ref *)param.data());
+
+    // DPC++ ROTM on the USM pointers; wait for completion before comparing.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::rotm(main_queue, N, x.data(), incx, y.data(), incy, param.data(),
+                                  dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::rotm,
+                    (main_queue, N, x.data(), incx, y.data(), incy, param.data(), dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &sycl_err) {
+        std::cout << "Caught synchronous SYCL exception during ROTM:\n"
+                  << sycl_err.what() << std::endl
+                  << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        // Nothing to compare when the backend reports the routine as unsupported.
+        return test_skipped;
+    }
+    catch (const std::runtime_error &failure) {
+        std::cout << "Error raised during execution of ROTM:\n" << failure.what() << std::endl;
+    }
+
+    // Both vectors are always checked (no short-circuit between the two calls).
+    const bool x_matches = check_equal_vector(x, x_ref, N, incx, N, std::cout);
+    const bool y_matches = check_equal_vector(y, y_ref, N, incy, N, std::cout);
+    return (x_matches && y_matches) ? 1 : 0;
+}
+
+class RotmUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case sweeps flag over -1, 0, 1 and -2, running strides (2, 3), (-2, -3), (1, 1) per flag.
+TEST_P(RotmUsmTests, RealSinglePrecision) {
+    float flag(-1.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
+    flag = 0.0;
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
+    flag = 1.0;
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
+    flag = -2.0;
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1, flag));
+}
+TEST_P(RotmUsmTests, RealDoublePrecision) {
+    double flag(-1.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
+    flag = 0.0;
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
+    flag = 1.0;
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
+    flag = -2.0;
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3, flag));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1, flag));
+}
+// Instantiated once per entry in the externally defined `devices` vector.
+INSTANTIATE_TEST_SUITE_P(RotmUsmTestSuite, RotmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotmg.cpp b/tests/unit_tests/blas/level1/rotmg.cpp
index 8bbe5a13d..90e907d3b 100644
--- a/tests/unit_tests/blas/level1/rotmg.cpp
+++ b/tests/unit_tests/blas/level1/rotmg.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev) {
+int test(const device& dev) {
     // Prepare data.
     fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref;
     vector<fp> param(5, fp(0)), param_ref(5, fp(0));
@@ -96,6 +96,14 @@ bool test(const device& dev) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -110,16 +118,16 @@ bool test(const device& dev) {
         good                = good_d1 && good_d2 && good_x1 && good_param;
     }
 
-    return good;
+    return (int)good;
 }
 
 class RotmgTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(RotmgTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam()));
+    EXPECT_TRUEORSKIP(test<float>(GetParam()));
 }
 TEST_P(RotmgTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam()));
+    EXPECT_TRUEORSKIP(test<double>(GetParam()));
 }
 
 INSTANTIATE_TEST_SUITE_P(RotmgTestSuite, RotmgTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/rotmg_usm.cpp b/tests/unit_tests/blas/level1/rotmg_usm.cpp
new file mode 100644
index 000000000..86585b831
--- /dev/null
+++ b/tests/unit_tests/blas/level1/rotmg_usm.cpp
@@ -0,0 +1,141 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device &dev) {
+    // USM ROTMG vs. reference ROTMG on `dev`: returns (int) of the comparison, or test_skipped
+    // if the backend is unsupported. Async SYCL exceptions are only printed by this handler.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during ROTMG:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> param(5, fp(0), ua), param_ref(5, fp(0), ua);
+    fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref;
+
+    d1     = rand_scalar<fp>();
+    d1     = std::abs(d1); // qualified so the floating-point overload is chosen, not ::abs(int)
+    d2     = rand_scalar<fp>();
+    x1     = rand_scalar<fp>();
+    y1     = rand_scalar<fp>();
+    d1_ref = d1;
+    d2_ref = d2;
+    x1_ref = x1;
+
+    // Call Reference ROTMG.
+    ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp *)param_ref.data());
+
+    // Call DPC++ ROTMG.
+    fp *d1_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    fp *d2_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    fp *x1_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt);
+    d1_p[0]  = d1;
+    d2_p[0]  = d2;
+    x1_p[0]  = x1;
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::rotmg(main_queue, d1_p, d2_p, x1_p, y1, param.data(), dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::rotmg,
+                    (main_queue, d1_p, d2_p, x1_p, y1, param.data(), dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during ROTMG:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        // Free the USM scalars before the early return so a skipped test does not leak them.
+        onemkl::free_shared(d1_p, cxt);
+        onemkl::free_shared(d2_p, cxt);
+        onemkl::free_shared(x1_p, cxt);
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good_d1    = check_equal(d1_p[0], d1_ref, 1, std::cout);
+    bool good_d2    = check_equal(d2_p[0], d2_ref, 1, std::cout);
+    bool good_x1    = check_equal(x1_p[0], x1_ref, 1, std::cout);
+    bool good_param = check_equal_vector(param, param_ref, 5, 1, 1, std::cout);
+    bool good       = good_d1 && good_d2 && good_x1 && good_param;
+    onemkl::free_shared(d1_p, cxt);
+    onemkl::free_shared(d2_p, cxt);
+    onemkl::free_shared(x1_p, cxt);
+    return (int)good;
+}
+// Device-parameterized fixture; each TEST_P below runs test<fp> for one precision.
+class RotmgUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(RotmgUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam()));
+}
+TEST_P(RotmgUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(RotmgUsmTestSuite, RotmgUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/scal.cpp b/tests/unit_tests/blas/level1/scal.cpp
index 3c384dde9..4c375c0ba 100644
--- a/tests/unit_tests/blas/level1/scal.cpp
+++ b/tests/unit_tests/blas/level1/scal.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device& dev, int N, int incx, fp_scalar alpha) {
+int test(const device& dev, int N, int incx, fp_scalar alpha) {
     // Prepare data.
     vector<fp> x, x_ref;
 
@@ -90,6 +90,14 @@ bool test(const device& dev, int N, int incx, fp_scalar alpha) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -97,40 +105,43 @@ bool test(const device& dev, int N, int incx, fp_scalar alpha) {
         good            = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class ScalTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(ScalTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<float, float>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3, alpha)));
 }
 TEST_P(ScalTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<double, double>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3, alpha)));
 }
 TEST_P(ScalTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<std::complex<float>, std::complex<float>>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, std::complex<float>>(GetParam(), 1357, -3, alpha)));
 }
 TEST_P(ScalTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<std::complex<double>, std::complex<double>>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, std::complex<double>>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, std::complex<double>>(GetParam(), 1357, -3, alpha)));
 }
 TEST_P(ScalTests, ComplexRealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -3, alpha)));
 }
 TEST_P(ScalTests, ComplexRealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, 2, alpha)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), 1357, -3, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3, alpha)));
 }
 
 INSTANTIATE_TEST_SUITE_P(ScalTestSuite, ScalTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/scal_usm.cpp b/tests/unit_tests/blas/level1/scal_usm.cpp
new file mode 100644
index 000000000..f5df7913b
--- /dev/null
+++ b/tests/unit_tests/blas/level1/scal_usm.cpp
@@ -0,0 +1,153 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp, typename fp_scalar>
+int test(const device& dev, int N, int incx, fp_scalar alpha) {
+    // Checks USM SCAL against the reference SCAL. Asynchronous SYCL exceptions are printed here.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during SCAL:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua);
+
+    rand_vector(x, N, incx);
+    // x_ref starts as a copy of x and is the vector passed to the reference routine.
+    auto x_ref = x;
+
+    // Call Reference SCAL.
+    using fp_ref        = typename ref_type_info<fp>::type;
+    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
+    // The reference call gets |incx|; the DPC++ call below gets incx unchanged.
+    const int N_ref = N, incx_ref = std::abs(incx);
+
+    ::scal(&N_ref, (fp_scalar_mkl*)&alpha, (fp_ref*)x_ref.data(), &incx_ref);
+
+    // Call DPC++ SCAL.
+    // With CALL_RT_API the routine is called directly; otherwise it goes through TEST_RUN_CT.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::scal(main_queue, N, alpha, x.data(), incx, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::scal,
+                    (main_queue, N, alpha, x.data(), incx, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during SCAL:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend ends the test early with test_skipped.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are printed; execution continues to the comparison below.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // The comparison outcome is returned as int (the skip path returns test_skipped instead).
+    bool good = check_equal_vector(x, x_ref, N, incx, N, std::cout);
+
+    return (int)good;
+}
+// Device-parameterized fixture; each TEST_P runs N=1357 with incx 2 and -3 for one type pair.
+class ScalUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(ScalUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<float, float>(GetParam(), 1357, -3, alpha)));
+}
+TEST_P(ScalUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<double, double>(GetParam(), 1357, -3, alpha)));
+}
+TEST_P(ScalUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, std::complex<float>>(GetParam(), 1357, -3, alpha)));
+}
+TEST_P(ScalUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, std::complex<double>>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, std::complex<double>>(GetParam(), 1357, -3, alpha)));
+}
+TEST_P(ScalUsmTests, ComplexRealSinglePrecision) {
+    float alpha(2.0);
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), 1357, -3, alpha)));
+}
+TEST_P(ScalUsmTests, ComplexRealDoublePrecision) {
+    double alpha(2.0);
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, 2, alpha)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), 1357, -3, alpha)));
+}
+
+INSTANTIATE_TEST_SUITE_P(ScalUsmTestSuite, ScalUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/sdsdot.cpp b/tests/unit_tests/blas/level1/sdsdot.cpp
index a6720a3e9..a5bdc9c89 100644
--- a/tests/unit_tests/blas/level1/sdsdot.cpp
+++ b/tests/unit_tests/blas/level1/sdsdot.cpp
@@ -41,7 +41,7 @@ extern std::vector<cl::sycl::device> devices;
 
 namespace {
 
-bool test(const device &dev, int N, int incx, int incy, float alpha) {
+int test(const device &dev, int N, int incx, int incy, float alpha) {
     // Prepare data.
     vector<float> x, y;
     float result = float(-1), result_ref = float(-1);
@@ -91,6 +91,14 @@ bool test(const device &dev, int N, int incx, int incy, float alpha) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -98,15 +106,15 @@ bool test(const device &dev, int N, int incx, int incy, float alpha) {
         good                 = check_equal(result_accessor[0], result_ref, N, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SdsdotTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(SdsdotTests, RealSinglePrecision) {
-    EXPECT_TRUE(test(GetParam(), 1357, 2, 3, 2.0));
-    EXPECT_TRUE(test(GetParam(), 1357, -2, -3, 2.0));
-    EXPECT_TRUE(test(GetParam(), 1357, 1, 1, 2.0));
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, 2.0));
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, 2.0));
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, 2.0));
 }
 
 INSTANTIATE_TEST_SUITE_P(SdsdotTestSuite, SdsdotTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/sdsdot_usm.cpp b/tests/unit_tests/blas/level1/sdsdot_usm.cpp
new file mode 100644
index 000000000..a93a3d711
--- /dev/null
+++ b/tests/unit_tests/blas/level1/sdsdot_usm.cpp
@@ -0,0 +1,126 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+int test(const device &dev, int N, int incx, int incy, float alpha) {
+    // USM SDSDOT vs. reference SDSDOT on `dev`: returns (int) of the comparison, or test_skipped
+    // if the backend is unsupported. Async SYCL exceptions are only printed by this handler.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<float, usm::alloc::shared, 64>(cxt, dev);
+    vector<float, decltype(ua)> x(ua), y(ua);
+    float result_ref = float(-1);
+
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+
+    // Call Reference SDSDOT.
+    const int N_ref = N, incx_ref = incx, incy_ref = incy;
+
+    result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(),
+                          &incy_ref);
+
+    // Call DPC++ SDSDOT.
+    auto result_p = (float *)onemkl::malloc_shared(64, sizeof(float), dev, cxt);
+    *result_p = float(-1); // known sentinel in case the call below fails to write a result
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::sdsdot(main_queue, N, alpha, x.data(), incx, y.data(), incy, result_p,
+                                    dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::sdsdot,
+                    (main_queue, N, alpha, x.data(), incx, y.data(), incy, result_p, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SDSDOT:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        // Free the USM result before the early return so a skipped test does not leak it.
+        onemkl::free_shared(result_p, cxt);
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal(*result_p, result_ref, N, std::cout);
+
+    onemkl::free_shared(result_p, cxt);
+    return (int)good;
+}
+// Device-parameterized fixture; the single TEST_P covers three (incx, incy) combinations.
+class SdsdotUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(SdsdotUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, 2.0));
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, 2.0));
+    EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, 2.0));
+}
+
+INSTANTIATE_TEST_SUITE_P(SdsdotUsmTestSuite, SdsdotUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/swap.cpp b/tests/unit_tests/blas/level1/swap.cpp
index 0fd2407d2..b35293ea1 100644
--- a/tests/unit_tests/blas/level1/swap.cpp
+++ b/tests/unit_tests/blas/level1/swap.cpp
@@ -42,7 +42,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, int N, int incx, int incy) {
+int test(const device& dev, int N, int incx, int incy) {
     // Prepare data.
     vector<fp> x, x_ref, y, y_ref;
     rand_vector(x, N, incx);
@@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) {
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -101,30 +109,30 @@ bool test(const device& dev, int N, int incx, int incy) {
         good            = good_x && good_y;
     }
 
-    return good;
+    return (int)good;
 }
 
 class SwapTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(SwapTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, -2, -3));
-    EXPECT_TRUE(test<float>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1));
 }
 TEST_P(SwapTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, -2, -3));
-    EXPECT_TRUE(test<double>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1));
 }
 TEST_P(SwapTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, -2, -3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
 }
 TEST_P(SwapTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, -2, -3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(SwapTestSuite, SwapTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level1/swap_usm.cpp b/tests/unit_tests/blas/level1/swap_usm.cpp
new file mode 100644
index 000000000..ad51f4acd
--- /dev/null
+++ b/tests/unit_tests/blas/level1/swap_usm.cpp
@@ -0,0 +1,141 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device& dev, int N, int incx, int incy) {
+    // Checks USM SWAP against the reference SWAP. Asynchronous SYCL exceptions are printed here.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during SWAP:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua);
+    rand_vector(x, N, incx);
+    rand_vector(y, N, incy);
+    // x_ref / y_ref start as copies of the inputs and are passed to the reference routine.
+    auto x_ref = x;
+    auto y_ref = y;
+
+    // Call Reference SWAP.
+    using fp_ref    = typename ref_type_info<fp>::type;
+    const int N_ref = N, incx_ref = incx, incy_ref = incy;
+
+    ::swap(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
+
+    // Call DPC++ SWAP.
+    // With CALL_RT_API the routine is called directly; otherwise it goes through TEST_RUN_CT.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::swap(main_queue, N, x.data(), incx, y.data(), incy, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::swap,
+                    (main_queue, N, x.data(), incx, y.data(), incy, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during SWAP:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend ends the test early with test_skipped.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are printed; execution continues to the comparison below.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // Both vectors must match their reference counterparts for `good` to be true.
+    bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout);
+    bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout);
+    bool good   = good_x && good_y;
+
+    return (int)good;
+}
+// Device-parameterized fixture; each TEST_P covers three (incx, incy) pairs for one type.
+class SwapUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(SwapUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 1357, 1, 1));
+}
+TEST_P(SwapUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 1357, 1, 1));
+}
+TEST_P(SwapUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 1357, 1, 1));
+}
+TEST_P(SwapUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 1357, 1, 1));
+}
+
+INSTANTIATE_TEST_SUITE_P(SwapUsmTestSuite, SwapUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/CMakeLists.txt b/tests/unit_tests/blas/level2/CMakeLists.txt
index 7e2db862b..4facc1456 100644
--- a/tests/unit_tests/blas/level2/CMakeLists.txt
+++ b/tests/unit_tests/blas/level2/CMakeLists.txt
@@ -18,7 +18,7 @@
 #===============================================================================
 
 # Build object from all test sources
-set(L2_SOURCES "hpr2.cpp" "hpmv.cpp" "her.cpp" "her2.cpp" "hemv.cpp" "hbmv.cpp" "geru.cpp" "ger.cpp" "gerc.cpp" "gemv.cpp" "gbmv.cpp" "trsv.cpp" "trmv.cpp" "tpsv.cpp" "tpmv.cpp" "tbsv.cpp" "tbmv.cpp" "syr.cpp" "syr2.cpp" "symv.cpp" "spr.cpp" "spr2.cpp" "spmv.cpp" "sbmv.cpp" "hpr.cpp")
+set(L2_SOURCES "hpr2.cpp" "hpmv.cpp" "her.cpp" "her2.cpp" "hemv.cpp" "hbmv.cpp" "geru.cpp" "ger.cpp" "gerc.cpp" "gemv.cpp" "gbmv.cpp" "trsv.cpp" "trmv.cpp" "tpsv.cpp" "tpmv.cpp" "tbsv.cpp" "tbmv.cpp" "syr.cpp" "syr2.cpp" "symv.cpp" "spr.cpp" "spr2.cpp" "spmv.cpp" "sbmv.cpp" "hpr.cpp" "hpr2_usm.cpp" "hpmv_usm.cpp" "her_usm.cpp" "her2_usm.cpp" "hemv_usm.cpp" "hbmv_usm.cpp" "geru_usm.cpp" "ger_usm.cpp" "gerc_usm.cpp" "gemv_usm.cpp" "gbmv_usm.cpp" "trsv_usm.cpp" "trmv_usm.cpp" "tpsv_usm.cpp" "tpmv_usm.cpp" "tbsv_usm.cpp" "tbmv_usm.cpp" "syr_usm.cpp" "syr2_usm.cpp" "symv_usm.cpp" "spr_usm.cpp" "spr2_usm.cpp" "spmv_usm.cpp" "sbmv_usm.cpp" "hpr_usm.cpp")
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_level2_rt OBJECT ${L2_SOURCES})
diff --git a/tests/unit_tests/blas/level2/gbmv.cpp b/tests/unit_tests/blas/level2/gbmv.cpp
index b03534b30..e16bbf180 100644
--- a/tests/unit_tests/blas/level2/gbmv.cpp
+++ b/tests/unit_tests/blas/level2/gbmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha,
-          fp beta, int incx, int incy, int lda) {
+int test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha,
+         fp beta, int incx, int incy, int lda) {
     // Prepare data.
     int x_len = outer_dimension(transa, m, n);
     int y_len = inner_dimension(transa, m, n);
@@ -103,6 +103,14 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -110,7 +118,7 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int
         good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class GbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -118,78 +126,78 @@ class GbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GbmvTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                            -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha,
+                                  beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
 }
 TEST_P(GbmvTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                             -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha,
+                                   beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
 }
 TEST_P(GbmvTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                          alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                          alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                          alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                          beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                          beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                          beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                          alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                          alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                          alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, 1, 1, 42));
 }
 TEST_P(GbmvTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                           alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                           alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7,
-                                           alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
-                                           alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
-                                           alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
-                                           alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                           alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                           alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7,
-                                           alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(GbmvTestSuite, GbmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/gbmv_usm.cpp b/tests/unit_tests/blas/level2/gbmv_usm.cpp
new file mode 100644
index 000000000..d6a7ebe58
--- /dev/null
+++ b/tests/unit_tests/blas/level2/gbmv_usm.cpp
@@ -0,0 +1,205 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha,
+         fp beta, int incx, int incy, int lda) {
+    // USM GBMV vs. reference ::gbmv check. Handler below only prints asynchronous SYCL exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GBMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies; // stays empty: nothing is ever added before the GBMV call
+
+    // Prepare data: x, y and A are std::vectors backed by usm::alloc::shared allocations.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    int x_len = outer_dimension(transa, m, n);
+    int y_len = inner_dimension(transa, m, n);
+
+    rand_vector(x, x_len, incx);
+    rand_vector(y, y_len, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, m, n, lda);
+
+    auto y_ref = y; // copy of the input y; the reference ::gbmv below writes into this copy
+
+    // Call Reference GBMV (arguments are passed by pointer, hence the *_ref locals).
+    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    int kl_ref = kl, ku_ref = ku;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::gbmv(convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref, &ku_ref, (fp_ref *)&alpha,
+           (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
+           (fp_ref *)y_ref.data(), &incy_ref);
+
+    // Call DPC++ GBMV.
+    // CALL_RT_API selects the direct call; otherwise the call goes through the TEST_RUN_CT macro.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::gbmv(main_queue, transa, m, n, kl, ku, alpha, A.data(), lda, x.data(),
+                                  incx, beta, y.data(), incy, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::gbmv,
+                    (main_queue, transa, m, n, kl, ku, alpha, A.data(), lda, x.data(), incx, beta,
+                     y.data(), incy, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GBMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: report test_skipped to the caller instead of comparing results.
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    // Any other std::runtime_error is only logged; the comparison below still runs.
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_vector(y, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
+
+    return (int)good;
+}
+
+class GbmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {}; // param: SYCL device
+
+TEST_P(GbmvUsmTests, RealSinglePrecision) { // float: nontrans/trans x (incx, incy) in {(2,3), (-2,-3), (1,1)}
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha,
+                                  beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
+}
+TEST_P(GbmvUsmTests, RealDoublePrecision) { // double: nontrans/trans x (incx, incy) in {(2,3), (-2,-3), (1,1)}
+    double alpha(2.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha,
+                                   beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42));
+}
+TEST_P(GbmvUsmTests, ComplexSinglePrecision) { // complex<float>: nontrans/trans/conjtrans x strides (2,3), (-2,-3), (1,1)
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5,
+                                                7, alpha, beta, 1, 1, 42));
+}
+TEST_P(GbmvUsmTests, ComplexDoublePrecision) { // complex<double>: nontrans/trans/conjtrans x strides (2,3), (-2,-3), (1,1)
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5,
+                                                 7, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 5, 7, alpha, beta, 1, 1, 42));
+}
+
+INSTANTIATE_TEST_SUITE_P(GbmvUsmTestSuite, GbmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // one instantiation per entry of the extern `devices` vector
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gemv.cpp b/tests/unit_tests/blas/level2/gemv.cpp
index ed238d42b..8c2899c8f 100644
--- a/tests/unit_tests/blas/level2/gemv.cpp
+++ b/tests/unit_tests/blas/level2/gemv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx,
-          int incy, int lda) {
+int test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx,
+         int incy, int lda) {
     // Prepare data.
     int x_len = outer_dimension(transa, m, n);
     int y_len = inner_dimension(transa, m, n);
@@ -102,6 +102,14 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, f
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -109,7 +117,7 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, f
         good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class GemvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -117,73 +125,78 @@ class GemvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GemvTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
 }
 TEST_P(GemvTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
         test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
 }
 TEST_P(GemvTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                          beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                          beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                          beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta,
-                                          2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta,
-                                          -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta,
-                                          1, 1, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                          beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                          beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                          beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, 1, 1, 42));
 }
 TEST_P(GemvTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                           beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                           beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha,
-                                           beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
-                                           beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
-                                           beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
-                                           beta, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                           beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                           beta, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha,
-                                           beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(GemvTestSuite, GemvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/gemv_usm.cpp b/tests/unit_tests/blas/level2/gemv_usm.cpp
new file mode 100644
index 000000000..bec22b770
--- /dev/null
+++ b/tests/unit_tests/blas/level2/gemv_usm.cpp
@@ -0,0 +1,219 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one USM GEMV problem on `dev` and checks the result against the
+// reference BLAS implementation.
+//
+// x, y and A are allocated with a shared-USM allocator; the reference ::gemv
+// is run on a copy of y, then onemkl::blas::gemv is run on the USM pointers.
+//
+// Returns test_skipped when the backend reports the routine as unsupported,
+// otherwise the result of the element-wise comparison converted to int.
+template <typename fp>
+int test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx,
+         int incy, int lda) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GEMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    int x_len = outer_dimension(transa, m, n);
+    int y_len = inner_dimension(transa, m, n);
+
+    rand_vector(x, x_len, incx);
+    rand_vector(y, y_len, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, m, n, lda);
+
+    auto y_ref = y;
+
+    // Call Reference GEMV.
+    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::gemv(convert_to_cblas_trans(transa), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // Call DPC++ GEMV.
+
+    try {
+        // wait_and_throw() (not wait()) passes pending asynchronous errors to
+        // exception_handler here, before the results are compared below.
+#ifdef CALL_RT_API
+        done = onemkl::blas::gemv(main_queue, transa, m, n, alpha, A.data(), lda, x.data(), incx,
+                                  beta, y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::gemv,
+                    (main_queue, transa, m, n, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
+                     incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GEMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &) {
+        // The backend does not provide this routine: report a skip, not a failure.
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_vector(y, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
+
+    return (int)good;
+}
+
+// Test fixture parameterized over the SYCL device to run on.
+class GemvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Every case below uses m = 25, n = 30, lda = 42 and exercises strided (2, 3),
+// negative (-2, -3) and unit (1, 1) increments for each transpose mode tested.
+TEST_P(GemvUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<float>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
+}
+TEST_P(GemvUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
+}
+TEST_P(GemvUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans, 25, 30, alpha,
+                                                beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                alpha, beta, 1, 1, 42));
+}
+TEST_P(GemvUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans, 25, 30,
+                                                 alpha, beta, 1, 1, 42));
+}
+
+// Instantiate the suite once per entry in the externally defined `devices` list.
+INSTANTIATE_TEST_SUITE_P(GemvUsmTestSuite, GemvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/ger.cpp b/tests/unit_tests/blas/level2/ger.cpp
index 4b05612f6..9bffff959 100644
--- a/tests/unit_tests/blas/level2/ger.cpp
+++ b/tests/unit_tests/blas/level2/ger.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
     // Prepare data.
 
     vector<fp> x, y, A_ref, A;
@@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
         good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class GerTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(GerTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 TEST_P(GerTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(GerTestSuite, GerTests, ::testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/blas/level2/ger_usm.cpp b/tests/unit_tests/blas/level2/ger_usm.cpp
new file mode 100644
index 000000000..810627c1c
--- /dev/null
+++ b/tests/unit_tests/blas/level2/ger_usm.cpp
@@ -0,0 +1,151 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one USM GER problem on `dev` and checks the result against the
+// reference BLAS implementation.
+//
+// x, y and A are allocated with a shared-USM allocator; the reference ::ger
+// updates a copy of A, then onemkl::blas::ger updates A through USM pointers.
+//
+// Returns test_skipped when the backend reports the routine as unsupported,
+// otherwise the result of the element-wise comparison converted to int.
+template <typename fp>
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GER:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, m, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, m, n, lda);
+
+    auto A_ref = A;
+
+    // Call Reference GER.
+    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::ger(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
+          &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ GER.
+
+    try {
+        // wait_and_throw() (not wait()) passes pending asynchronous errors to
+        // exception_handler here, before the results are compared below.
+#ifdef CALL_RT_API
+        done = onemkl::blas::ger(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                                 lda, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::ger,
+            (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GER:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &) {
+        // The backend does not provide this routine: report a skip, not a failure.
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
+
+    return (int)good;
+}
+
+// Test fixture parameterized over the SYCL device to run on.
+class GerUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Every case below uses m = 25, n = 30, lda = 42 and exercises strided (2, 3),
+// negative (-2, -3) and unit (1, 1) increments for x and y.
+TEST_P(GerUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+TEST_P(GerUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+
+// Instantiate the suite once per entry in the externally defined `devices` list.
+INSTANTIATE_TEST_SUITE_P(GerUsmTestSuite, GerUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gerc.cpp b/tests/unit_tests/blas/level2/gerc.cpp
index 7ac1996ab..7d12c6b1a 100644
--- a/tests/unit_tests/blas/level2/gerc.cpp
+++ b/tests/unit_tests/blas/level2/gerc.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
     // Prepare data.
 
     vector<fp> x, y, A_ref, A;
@@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
         good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class GercTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(GercTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 TEST_P(GercTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(GercTestSuite, GercTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/gerc_usm.cpp b/tests/unit_tests/blas/level2/gerc_usm.cpp
new file mode 100644
index 000000000..0a8139d7a
--- /dev/null
+++ b/tests/unit_tests/blas/level2/gerc_usm.cpp
@@ -0,0 +1,151 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one USM GERC problem on `dev` and checks the result against the
+// reference BLAS implementation.
+//
+// x, y and A are allocated with a shared-USM allocator; the reference ::gerc
+// updates a copy of A, then onemkl::blas::gerc updates A through USM pointers.
+//
+// Returns test_skipped when the backend reports the routine as unsupported,
+// otherwise the result of the element-wise comparison converted to int.
+template <typename fp>
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GERC:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, m, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, m, n, lda);
+
+    auto A_ref = A;
+
+    // Call Reference GERC.
+    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::gerc(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
+           &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ GERC.
+
+    try {
+        // wait_and_throw() (not wait()) passes pending asynchronous errors to
+        // exception_handler here, before the results are compared below.
+#ifdef CALL_RT_API
+        done = onemkl::blas::gerc(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                                  lda, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::gerc,
+            (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GERC:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &) {
+        // The backend does not provide this routine: report a skip, not a failure.
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
+
+    return (int)good;
+}
+
+// Test fixture parameterized over the SYCL device to run on.
+class GercUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Every case below uses m = 25, n = 30, lda = 42 and exercises strided (2, 3),
+// negative (-2, -3) and unit (1, 1) increments for x and y.
+TEST_P(GercUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+TEST_P(GercUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+
+// Instantiate the suite once per entry in the externally defined `devices` list.
+INSTANTIATE_TEST_SUITE_P(GercUsmTestSuite, GercUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/geru.cpp b/tests/unit_tests/blas/level2/geru.cpp
index 13b423116..de358687d 100644
--- a/tests/unit_tests/blas/level2/geru.cpp
+++ b/tests/unit_tests/blas/level2/geru.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
     // Prepare data.
 
     vector<fp> x, y, A_ref, A;
@@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda
         good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class GeruTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(GeruTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 TEST_P(GeruTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(GeruTestSuite, GeruTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/geru_usm.cpp b/tests/unit_tests/blas/level2/geru_usm.cpp
new file mode 100644
index 000000000..3edcfaf30
--- /dev/null
+++ b/tests/unit_tests/blas/level2/geru_usm.cpp
@@ -0,0 +1,151 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one USM GERU problem on `dev` and checks the result against the
+// reference BLAS implementation.
+//
+// x, y and A are allocated with a shared-USM allocator; the reference ::geru
+// updates a copy of A, then onemkl::blas::geru updates A through USM pointers.
+//
+// Returns test_skipped when the backend reports the routine as unsupported,
+// otherwise the result of the element-wise comparison converted to int.
+template <typename fp>
+int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during GERU:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, m, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, m, n, lda);
+
+    auto A_ref = A;
+
+    // Call Reference GERU.
+    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::geru(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
+           &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ GERU.
+
+    try {
+        // wait_and_throw() (not wait()) passes pending asynchronous errors to
+        // exception_handler here, before the results are compared below.
+#ifdef CALL_RT_API
+        done = onemkl::blas::geru(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                                  lda, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::geru,
+            (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during GERU:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &) {
+        // The backend does not provide this routine: report a skip, not a failure.
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max<int>(m, n), std::cout);
+
+    return (int)good;
+}
+
+// Test fixture parameterized over the SYCL device to run on.
+class GeruUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Every case below uses m = 25, n = 30, lda = 42 and exercises strided (2, 3),
+// negative (-2, -3) and unit (1, 1) increments for x and y.
+TEST_P(GeruUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+TEST_P(GeruUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 25, 30, alpha, 1, 1, 42));
+}
+
+// Instantiate the suite once per entry in the externally defined `devices` list.
+INSTANTIATE_TEST_SUITE_P(GeruUsmTestSuite, GeruUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hbmv.cpp b/tests/unit_tests/blas/level2/hbmv.cpp
index 2882f2c26..7d4f45734 100644
--- a/tests/unit_tests/blas/level2/hbmv.cpp
+++ b/tests/unit_tests/blas/level2/hbmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
-          int incy, int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
+         int incy, int lda) {
     // Prepare data.
 
     vector<fp> x, y, y_ref, A;
@@ -101,6 +101,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -108,7 +116,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -116,33 +124,33 @@ class HbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(HbmvTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
 }
 TEST_P(HbmvTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2,
-                                           -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2,
-                                           -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha,
+                                                 beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha,
+                                                 beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
 }
 
diff --git a/tests/unit_tests/blas/level2/hbmv_usm.cpp b/tests/unit_tests/blas/level2/hbmv_usm.cpp
new file mode 100644
index 000000000..2b00a4515
--- /dev/null
+++ b/tests/unit_tests/blas/level2/hbmv_usm.cpp
@@ -0,0 +1,159 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM HBMV on dev and compares the result with the reference BLAS implementation.
+// x, y and A are allocated in shared USM and populated by rand_vector/rand_matrix; y_ref is a
+// copy of y that the reference ::hbmv call updates.
+// Returns test_skipped when the backend throws backend_unsupported_exception, otherwise the
+// check_equal_vector result converted to int.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
+         int incy, int lda) {
+    // Handler for asynchronous SYCL errors: logs each one to std::cout.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HBMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+
+    auto y_ref = y;
+
+    // Call Reference HBMV.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    const int k_ref = k;
+    using fp_ref    = typename ref_type_info<fp>::type;
+
+    ::hbmv(convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // Call DPC++ HBMV; wait_and_throw() (unlike wait()) also reports pending asynchronous errors.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hbmv(main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(),
+                                  incx, beta, y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hbmv,
+                    (main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), incx, beta,
+                     y.data(), incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HBMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+    return static_cast<int>(good);
+}
+
+// Device-parameterized HBMV USM fixture; each case passes GetParam() as test()'s dev argument.
+class HbmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+TEST_P(HbmvUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+}
+TEST_P(HbmvUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha,
+                                                 beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha,
+                                                 beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+}
+// Instantiate the suite for every element of the external `devices` vector.
+INSTANTIATE_TEST_SUITE_P(HbmvUsmTestSuite, HbmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hemv.cpp b/tests/unit_tests/blas/level2/hemv.cpp
index ef3c0bbc8..848a0e1ef 100644
--- a/tests/unit_tests/blas/level2/hemv.cpp
+++ b/tests/unit_tests/blas/level2/hemv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
-          int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
+         int lda) {
     // Prepare data.
     vector<fp> x, y, y_ref, A;
 
@@ -99,6 +99,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -106,7 +114,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HemvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -114,33 +122,33 @@ class HemvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(HemvTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
 }
 TEST_P(HemvTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
 }
 
diff --git a/tests/unit_tests/blas/level2/hemv_usm.cpp b/tests/unit_tests/blas/level2/hemv_usm.cpp
new file mode 100644
index 000000000..ab55ceca2
--- /dev/null
+++ b/tests/unit_tests/blas/level2/hemv_usm.cpp
@@ -0,0 +1,158 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM HEMV on dev and compares the result with the reference BLAS implementation.
+// x, y and A are allocated in shared USM and populated by rand_vector/rand_matrix; y_ref is a
+// copy of y that the reference ::hemv call updates.
+// Returns test_skipped when the backend throws backend_unsupported_exception, otherwise the
+// check_equal_vector result converted to int.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
+         int lda) {
+    // Handler for asynchronous SYCL errors: logs each one to std::cout.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HEMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+
+    auto y_ref = y;
+
+    // Call Reference HEMV.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::hemv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // Call DPC++ HEMV; wait_and_throw() (unlike wait()) also reports pending asynchronous errors.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hemv(main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx,
+                                  beta, y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hemv,
+                    (main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta,
+                     y.data(), incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HEMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+    return static_cast<int>(good);
+}
+
+// Device-parameterized HEMV USM fixture; each case passes GetParam() as test()'s dev argument.
+class HemvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+TEST_P(HemvUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+}
+TEST_P(HemvUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+}
+// Instantiate the suite for every element of the external `devices` vector.
+INSTANTIATE_TEST_SUITE_P(HemvUsmTestSuite, HemvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her.cpp b/tests/unit_tests/blas/level2/her.cpp
index 60686797f..e5d79586f 100644
--- a/tests/unit_tests/blas/level2/her.cpp
+++ b/tests/unit_tests/blas/level2/her.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) {
     // Prepare data.
     vector<fp> x, A_ref, A;
     rand_vector(x, n, incx);
@@ -93,6 +93,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -100,39 +108,39 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i
         good            = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HerTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(HerTests, ComplexSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)));
 }
 TEST_P(HerTests, ComplexDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)));
 }
 
diff --git a/tests/unit_tests/blas/level2/her2.cpp b/tests/unit_tests/blas/level2/her2.cpp
index 4b3d2c1de..dc2a20781 100644
--- a/tests/unit_tests/blas/level2/her2.cpp
+++ b/tests/unit_tests/blas/level2/her2.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
-          int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
+         int lda) {
     // Prepare data.
     vector<fp> x, y, A_ref, A;
 
@@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -105,28 +113,40 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Her2Tests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(Her2Tests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
 }
 TEST_P(Her2Tests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(Her2TestSuite, Her2Tests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/her2_usm.cpp b/tests/unit_tests/blas/level2/her2_usm.cpp
new file mode 100644
index 000000000..bb5dc6107
--- /dev/null
+++ b/tests/unit_tests/blas/level2/her2_usm.cpp
@@ -0,0 +1,155 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs USM HER2 on dev and compares the result with the reference BLAS implementation.
+// x, y and A are allocated in shared USM and populated by rand_vector/rand_matrix; A_ref is a
+// copy of A that the reference ::her2 call updates.
+// Returns test_skipped when the backend throws backend_unsupported_exception, otherwise the
+// check_equal_matrix result converted to int.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
+         int lda) {
+    // Handler for asynchronous SYCL errors: logs each one to std::cout.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HER2:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+
+    auto A_ref = A;
+
+    // Call Reference HER2.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::her2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ HER2; wait_and_throw() (unlike wait()) also reports pending asynchronous errors.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::her2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy,
+                                  A.data(), lda, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::her2,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                     lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HER2:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout);
+    return static_cast<int>(good);
+}
+
+// Device-parameterized HER2 USM fixture; each case passes GetParam() as test()'s dev argument.
+class Her2UsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+TEST_P(Her2UsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+}
+TEST_P(Her2UsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+}
+// Instantiate the suite for every element of the external `devices` vector.
+INSTANTIATE_TEST_SUITE_P(Her2UsmTestSuite, Her2UsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her_usm.cpp b/tests/unit_tests/blas/level2/her_usm.cpp
new file mode 100644
index 000000000..e99cb9273
--- /dev/null
+++ b/tests/unit_tests/blas/level2/her_usm.cpp
@@ -0,0 +1,153 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs HER through the reference BLAS and through onemkl::blas::her on USM shared
+// allocations, then compares the two result matrices. Returns test_skipped when the
+// backend reports the routine as unsupported; otherwise the comparison outcome as an
+// int (1 = results match, 0 = mismatch).
+template <typename fp, typename fp_scalar>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HER:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+
+    auto A_ref = A;
+
+    // Call Reference HER.
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    using fp_ref        = typename ref_type_info<fp>::type;
+    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
+
+    ::her(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(),
+          &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ HER. wait_and_throw() is used instead of wait() so that pending
+    // asynchronous errors are delivered to exception_handler rather than dropped.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::her(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda,
+                                 dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::her,
+            (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HER:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout);
+    return static_cast<int>(good);
+}
+
+// Fixture: each test receives the SYCL device to run on as its parameter.
+class HerUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// complex<float> data with a float scalar: n = 30, lda = 42, incx in {2, -2, 1}; every
+// increment is run for the lower and then the upper triangle.
+TEST_P(HerUsmTests, ComplexSinglePrecision) {
+    using matrix_t = std::complex<float>;
+    float alpha(2.0);
+    const int increments[]         = { 2, -2, 1 };
+    const onemkl::uplo triangles[] = { onemkl::uplo::lower, onemkl::uplo::upper };
+    for (const int incx : increments) {
+        for (const auto triangle : triangles) {
+            EXPECT_TRUEORSKIP(
+                (test<matrix_t, float>(GetParam(), triangle, 30, alpha, incx, 42)));
+        }
+    }
+}
+
+// complex<double> data with a double scalar; same sizes and increments as above.
+TEST_P(HerUsmTests, ComplexDoublePrecision) {
+    using matrix_t = std::complex<double>;
+    double alpha(2.0);
+    const int increments[]         = { 2, -2, 1 };
+    const onemkl::uplo triangles[] = { onemkl::uplo::lower, onemkl::uplo::upper };
+    for (const int incx : increments) {
+        for (const auto triangle : triangles) {
+            EXPECT_TRUEORSKIP(
+                (test<matrix_t, double>(GetParam(), triangle, 30, alpha, incx, 42)));
+        }
+    }
+}
+
+// Run the suite once per entry in the global `devices` list.
+INSTANTIATE_TEST_SUITE_P(HerUsmTestSuite, HerUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpmv.cpp b/tests/unit_tests/blas/level2/hpmv.cpp
index 64e89724e..4df20c625 100644
--- a/tests/unit_tests/blas/level2/hpmv.cpp
+++ b/tests/unit_tests/blas/level2/hpmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
-          int incy) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
+         int incy) {
     // Prepare data.
     vector<fp> x, y, y_ref, A;
     rand_vector(x, n, incx);
@@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -104,7 +112,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HpmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -112,26 +120,34 @@ class HpmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(HpmvTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
 }
 TEST_P(HpmvTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(HpmvTestSuite, HpmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/hpmv_usm.cpp b/tests/unit_tests/blas/level2/hpmv_usm.cpp
new file mode 100644
index 000000000..1e42d96d6
--- /dev/null
+++ b/tests/unit_tests/blas/level2/hpmv_usm.cpp
@@ -0,0 +1,156 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs HPMV through the reference BLAS and through onemkl::blas::hpmv on USM shared
+// allocations, then compares the two result vectors. Returns test_skipped when the
+// backend reports the routine as unsupported; otherwise the comparison outcome as an
+// int (1 = results match, 0 = mismatch).
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
+         int incy) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HPMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+
+    auto y_ref = y;
+
+    // Call Reference HPMV.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::hpmv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
+
+    // Call DPC++ HPMV. wait_and_throw() is used instead of wait() so that pending
+    // asynchronous errors are delivered to exception_handler rather than dropped.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hpmv(main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta,
+                                  y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hpmv,
+                    (main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, y.data(),
+                     incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HPMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+    return static_cast<int>(good);
+}
+
+// Fixture: each test receives the SYCL device to run on as its parameter.
+class HpmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// complex<float> HPMV with n = 30 over three (incx, incy) pairs; every pair is run for
+// the lower and then the upper triangle.
+TEST_P(HpmvUsmTests, ComplexSinglePrecision) {
+    using scalar_t = std::complex<float>;
+    scalar_t alpha(2.0, -0.5);
+    scalar_t beta(3.0, -1.5);
+    const int increments[][2]      = { { 2, 3 }, { -2, -3 }, { 1, 1 } };
+    const onemkl::uplo triangles[] = { onemkl::uplo::lower, onemkl::uplo::upper };
+    for (const auto &inc : increments) {
+        for (const auto triangle : triangles) {
+            EXPECT_TRUEORSKIP(
+                test<scalar_t>(GetParam(), triangle, 30, alpha, beta, inc[0], inc[1]));
+        }
+    }
+}
+
+// complex<double> HPMV; same size and increment pairs as the single-precision case.
+TEST_P(HpmvUsmTests, ComplexDoublePrecision) {
+    using scalar_t = std::complex<double>;
+    scalar_t alpha(2.0, -0.5);
+    scalar_t beta(3.0, -1.5);
+    const int increments[][2]      = { { 2, 3 }, { -2, -3 }, { 1, 1 } };
+    const onemkl::uplo triangles[] = { onemkl::uplo::lower, onemkl::uplo::upper };
+    for (const auto &inc : increments) {
+        for (const auto triangle : triangles) {
+            EXPECT_TRUEORSKIP(
+                test<scalar_t>(GetParam(), triangle, 30, alpha, beta, inc[0], inc[1]));
+        }
+    }
+}
+
+// Run the suite once per entry in the global `devices` list.
+INSTANTIATE_TEST_SUITE_P(HpmvUsmTestSuite, HpmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr.cpp b/tests/unit_tests/blas/level2/hpr.cpp
index d16d9ccee..45098543e 100644
--- a/tests/unit_tests/blas/level2/hpr.cpp
+++ b/tests/unit_tests/blas/level2/hpr.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) {
     // Prepare data.
     vector<fp> x, A_ref, A;
     rand_vector(x, n, incx);
@@ -93,6 +93,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -100,34 +108,40 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i
         good            = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HprTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(HprTests, ComplexSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1)));
 }
 
 TEST_P(HprTests, ComplexDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUE(
+    EXPECT_TRUEORSKIP(
         (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1)));
 }
 
diff --git a/tests/unit_tests/blas/level2/hpr2.cpp b/tests/unit_tests/blas/level2/hpr2.cpp
index 754407bb6..9aab3cc6c 100644
--- a/tests/unit_tests/blas/level2/hpr2.cpp
+++ b/tests/unit_tests/blas/level2/hpr2.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y, A_ref, A;
     rand_vector(x, n, incx);
@@ -95,6 +95,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -102,28 +110,32 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Hpr2Tests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(Hpr2Tests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
 }
 TEST_P(Hpr2Tests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(
+        test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(Hpr2TestSuite, Hpr2Tests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/hpr2_usm.cpp b/tests/unit_tests/blas/level2/hpr2_usm.cpp
new file mode 100644
index 000000000..4b8b65fe5
--- /dev/null
+++ b/tests/unit_tests/blas/level2/hpr2_usm.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs HPR2 through the reference BLAS and through onemkl::blas::hpr2 on USM shared
+// allocations, then compares the two result matrices. Returns test_skipped when the
+// backend reports the routine as unsupported; otherwise the comparison outcome as an
+// int (1 = results match, 0 = mismatch).
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HPR2:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+
+    auto A_ref = A;
+
+    // Call Reference HPR2.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::hpr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data());
+
+    // Call DPC++ HPR2. wait_and_throw() is used instead of wait() so that pending
+    // asynchronous errors are delivered to exception_handler rather than dropped.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hpr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy,
+                                  A.data(), dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hpr2,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                     dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HPR2:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout);
+    return static_cast<int>(good);
+}
+
+class Hpr2UsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(Hpr2UsmTests, ComplexSinglePrecision) {
+    // complex<float> HPR2, n = 30: each (incx, incy) pair runs for lower, then upper.
+    std::complex<float> alpha(2.0, -0.5);
+    const int increments[][2] = { { 2, 3 }, { -2, -3 }, { 1, 1 } };
+    for (const auto &inc : increments) {
+        for (const auto triangle : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+            EXPECT_TRUEORSKIP(
+                test<std::complex<float>>(GetParam(), triangle, 30, alpha, inc[0], inc[1]));
+        }
+    }
+}
+TEST_P(Hpr2UsmTests, ComplexDoublePrecision) {
+    // complex<double> HPR2, n = 30: each (incx, incy) pair runs for lower, then upper.
+    std::complex<double> alpha(2.0, -0.5);
+    const int increments[][2] = { { 2, 3 }, { -2, -3 }, { 1, 1 } };
+    for (const auto &inc : increments) {
+        for (const auto triangle : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+            EXPECT_TRUEORSKIP(
+                test<std::complex<double>>(GetParam(), triangle, 30, alpha, inc[0], inc[1]));
+        }
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(Hpr2UsmTestSuite, Hpr2UsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr_usm.cpp b/tests/unit_tests/blas/level2/hpr_usm.cpp
new file mode 100644
index 000000000..71c3b3d74
--- /dev/null
+++ b/tests/unit_tests/blas/level2/hpr_usm.cpp
@@ -0,0 +1,153 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs onemkl::blas::hpr (USM interface) on `dev` and compares the updated matrix with the
+// reference ::hpr result. Returns 1 if check_equal_matrix accepts it, 0 if not, and
+// test_skipped when the call throws onemkl::backend_unsupported_exception.
+template <typename fp, typename fp_scalar>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) {
+    // Report asynchronous SYCL errors; the handler only logs, it never rethrows.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during HPR:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in vectors backed by a shared-USM allocator bound to this context/device.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+
+    auto A_ref = A;
+
+    // Call Reference HPR on the copy A_ref; x is read in place and integers go in by pointer.
+    const int n_ref = n, incx_ref = incx;
+    using fp_ref        = typename ref_type_info<fp>::type;
+    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
+
+    ::hpr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(),
+          &incx_ref, (fp_ref *)A_ref.data());
+
+    // Call DPC++ HPR. wait_and_throw() is used instead of wait() so that pending asynchronous
+    // errors are handed to exception_handler before the results are compared.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hpr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(),
+                                 dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hpr,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during HPR:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation. After a logged
+    // error execution still reaches this point, so the comparison decides the return value.
+    bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout);
+    return (int)good;
+}
+
+class HprUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each precision runs test() for lower/upper with n = 30 and incx = 2, -2 and 1.
+TEST_P(HprUsmTests, ComplexSinglePrecision) {
+    float alpha(2.0); // real scalar: test() is instantiated with complex fp and real fp_scalar
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1)));
+}
+
+TEST_P(HprUsmTests, ComplexDoublePrecision) {
+    double alpha(2.0); // same scalar value as the single-precision cases
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1)));
+    EXPECT_TRUEORSKIP(
+        (test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1)));
+}
+// Instantiate the suite once per element of the externally declared `devices` vector.
+INSTANTIATE_TEST_SUITE_P(HprUsmTestSuite, HprUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/sbmv.cpp b/tests/unit_tests/blas/level2/sbmv.cpp
index d79e555f1..fdcf0da4c 100644
--- a/tests/unit_tests/blas/level2/sbmv.cpp
+++ b/tests/unit_tests/blas/level2/sbmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
-          int incy, int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
+         int incy, int lda) {
     // Prepare data.
     vector<fp> x, y, y_ref, A;
     rand_vector(x, n, incx);
@@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -105,7 +113,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -113,22 +121,24 @@ class SbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(SbmvTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
 }
 TEST_P(SbmvTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(SbmvTestSuite, SbmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/sbmv_usm.cpp b/tests/unit_tests/blas/level2/sbmv_usm.cpp
new file mode 100644
index 000000000..63921f870
--- /dev/null
+++ b/tests/unit_tests/blas/level2/sbmv_usm.cpp
@@ -0,0 +1,148 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs onemkl::blas::sbmv (USM interface) on `dev` and compares the resulting y with the
+// reference ::sbmv result. Returns 1 if check_equal_vector accepts it, 0 if not, and
+// test_skipped when the call throws onemkl::backend_unsupported_exception.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx,
+         int incy, int lda) {
+    // Report asynchronous SYCL errors; the handler only logs, it never rethrows.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SBMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in vectors backed by a shared-USM allocator bound to this context/device.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+
+    auto y_ref = y;
+
+    // Call Reference SBMV on the copy y_ref; A and x are read in place, integers go by pointer.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    const int k_ref = k;
+    using fp_ref    = typename ref_type_info<fp>::type;
+
+    ::sbmv(convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // Call DPC++ SBMV. wait_and_throw() is used instead of wait() so that pending asynchronous
+    // errors are handed to exception_handler before the results are compared.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::sbmv(main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(),
+                                  incx, beta, y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::sbmv,
+                    (main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), incx, beta,
+                     y.data(), incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SBMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation. After a logged
+    // error execution still reaches this point, so the comparison decides the return value.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+    return (int)good;
+}
+
+class SbmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Cases: lower/upper, n = 30, k = 5, lda = 42, (incx, incy) in {(2, 3), (-2, -3), (1, 1)}.
+TEST_P(SbmvUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+}
+TEST_P(SbmvUsmTests, RealDoublePrecision) {
+    double alpha(2.0); // same scalar values as the single-precision cases
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(
+        test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
+}
+// Instantiate the suite once per element of the externally declared `devices` vector.
+INSTANTIATE_TEST_SUITE_P(SbmvUsmTestSuite, SbmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spmv.cpp b/tests/unit_tests/blas/level2/spmv.cpp
index 4f1de876b..6d2831ce3 100644
--- a/tests/unit_tests/blas/level2/spmv.cpp
+++ b/tests/unit_tests/blas/level2/spmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
-          int incy) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
+         int incy) {
     // Prepare data.
     vector<fp> x, y, y_ref, A;
     rand_vector(x, n, incx);
@@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -104,7 +112,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SpmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -112,22 +120,22 @@ class SpmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(SpmvTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
 }
 TEST_P(SpmvTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(SpmvTestSuite, SpmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/spmv_usm.cpp b/tests/unit_tests/blas/level2/spmv_usm.cpp
new file mode 100644
index 000000000..95866fead
--- /dev/null
+++ b/tests/unit_tests/blas/level2/spmv_usm.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs onemkl::blas::spmv (USM interface) on `dev` and compares the resulting y with the
+// reference ::spmv result. Returns 1 if check_equal_vector accepts it, 0 if not, and
+// test_skipped when the call throws onemkl::backend_unsupported_exception.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx,
+         int incy) {
+    // Report asynchronous SYCL errors; the handler only logs, it never rethrows.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SPMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in vectors backed by a shared-USM allocator bound to this context/device.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+
+    auto y_ref = y;
+
+    // Call Reference SPMV on the copy y_ref; A and x are read in place, integers go by pointer.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::spmv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
+
+    // Call DPC++ SPMV. wait_and_throw() is used instead of wait() so that pending asynchronous
+    // errors are handed to exception_handler before the results are compared.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::spmv(main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta,
+                                  y.data(), incy, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::spmv,
+                    (main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, y.data(),
+                     incy, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SPMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception &) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation. After a logged
+    // error execution still reaches this point, so the comparison decides the return value.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+    return (int)good;
+}
+
+class SpmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Cases: lower/upper, n = 30, (incx, incy) in {(2, 3), (-2, -3), (1, 1)}.
+TEST_P(SpmvUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+}
+TEST_P(SpmvUsmTests, RealDoublePrecision) {
+    double alpha(2.0); // same scalar values as the single-precision cases
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1));
+}
+// Instantiate the suite once per element of the externally declared `devices` vector.
+INSTANTIATE_TEST_SUITE_P(SpmvUsmTestSuite, SpmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr.cpp b/tests/unit_tests/blas/level2/spr.cpp
index b9c457898..522460211 100644
--- a/tests/unit_tests/blas/level2/spr.cpp
+++ b/tests/unit_tests/blas/level2/spr.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) {
     // Prepare data.
     vector<fp> x, A_ref, A;
     rand_vector(x, n, incx);
@@ -92,6 +92,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -99,28 +107,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SprTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(SprTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
 }
 TEST_P(SprTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(SprTestSuite, SprTests, ::testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/blas/level2/spr2.cpp b/tests/unit_tests/blas/level2/spr2.cpp
index ccd2977b0..e3ce90ad7 100644
--- a/tests/unit_tests/blas/level2/spr2.cpp
+++ b/tests/unit_tests/blas/level2/spr2.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
     // Prepare data.
     vector<fp> x, y, A_ref, A;
     rand_vector(x, n, incx);
@@ -95,6 +95,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -102,28 +110,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Spr2Tests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(Spr2Tests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
 }
 TEST_P(Spr2Tests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
 }
 
 INSTANTIATE_TEST_SUITE_P(Spr2TestSuite, Spr2Tests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/spr2_usm.cpp b/tests/unit_tests/blas/level2/spr2_usm.cpp
new file mode 100644
index 000000000..b07fb7e1b
--- /dev/null
+++ b/tests/unit_tests/blas/level2/spr2_usm.cpp
@@ -0,0 +1,141 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// USM SPR2 test on `dev`: returns 1 if A matches the reference, 0 if not, or test_skipped.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) {
+    // Catch asynchronous exceptions: each SYCL exception is printed, not rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SPR2:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done; // assigned only when CALL_RT_API is defined
+    std::vector<event> dependencies; // never filled: passed to SPR2 as an empty list
+
+    // Prepare data: USM shared vectors (alignment 64), presumably sized and filled by rand_*.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+    // Copy the initial A; the reference routine works on this copy.
+    auto A_ref = A;
+
+    // Call Reference SPR2.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy;
+    using fp_ref = typename ref_type_info<fp>::type;
+    // The reference wrapper takes size and increments by pointer, and data as fp_ref pointers.
+    ::spr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data());
+
+    // Call DPC++ SPR2, directly under CALL_RT_API or through the TEST_RUN_CT macro otherwise.
+    // NOTE(review): wait() does not invoke the async handler; wait_and_throw() would.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::spr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy,
+                                  A.data(), dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::spr2,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                     dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SPR2:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend yields test_skipped rather than a comparison result.
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; execution continues to the comparison below.
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): also reached after a logged exception, when A may be unmodified.
+    bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout);
+
+    return (int)good;
+}
+// Fixture for the USM SPR2 cases, parameterized over cl::sycl::device.
+class Spr2UsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case uses n = 30, both triangles, and increments (2, 3), (-2, -3) and (1, 1).
+TEST_P(Spr2UsmTests, RealSinglePrecision) {
+    float alpha(2.0); // scalar passed unchanged to every call below
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+}
+TEST_P(Spr2UsmTests, RealDoublePrecision) {
+    double alpha(2.0); // same argument sets as the single-precision case
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1));
+}
+// One instantiation per entry of the external `devices` vector; names come from ::DeviceNamePrint.
+INSTANTIATE_TEST_SUITE_P(Spr2UsmTestSuite, Spr2UsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr_usm.cpp b/tests/unit_tests/blas/level2/spr_usm.cpp
new file mode 100644
index 000000000..09c958d6a
--- /dev/null
+++ b/tests/unit_tests/blas/level2/spr_usm.cpp
@@ -0,0 +1,139 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// USM SPR test on `dev`: returns 1 if A matches the reference, 0 if not, or test_skipped.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) {
+    // Catch asynchronous exceptions: each SYCL exception is printed, not rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SPR:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done; // assigned only when CALL_RT_API is defined
+    std::vector<event> dependencies; // never filled: passed to SPR as an empty list
+
+    // Prepare data: USM shared vectors (alignment 64), presumably sized and filled by rand_*.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, n);
+    // Copy the initial A; the reference routine works on this copy.
+    auto A_ref = A;
+
+    // Call Reference SPR.
+    const int n_ref = n, incx_ref = incx;
+    using fp_ref = typename ref_type_info<fp>::type;
+    // The reference wrapper takes size and increment by pointer, and data as fp_ref pointers.
+    ::spr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+          &incx_ref, (fp_ref *)A_ref.data());
+
+    // Call DPC++ SPR, directly under CALL_RT_API or through the TEST_RUN_CT macro otherwise.
+    // NOTE(review): wait() does not invoke the async handler; wait_and_throw() would.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::spr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(),
+                                 dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::spr,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SPR:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend yields test_skipped rather than a comparison result.
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; execution continues to the comparison below.
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): also reached after a logged exception, when A may be unmodified.
+    bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout);
+
+    return (int)good;
+}
+// Fixture for the USM SPR cases, parameterized over cl::sycl::device.
+class SprUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case uses n = 30, both triangles, and x increments 2, -2 and 1.
+TEST_P(SprUsmTests, RealSinglePrecision) {
+    float alpha(2.0); // scalar passed unchanged to every call below
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
+}
+TEST_P(SprUsmTests, RealDoublePrecision) {
+    double alpha(2.0); // same argument sets as the single-precision case
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1));
+}
+// One instantiation per entry of the external `devices` vector; names come from ::DeviceNamePrint.
+INSTANTIATE_TEST_SUITE_P(SprUsmTestSuite, SprUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/symv.cpp b/tests/unit_tests/blas/level2/symv.cpp
index a8d154f6b..4152be934 100644
--- a/tests/unit_tests/blas/level2/symv.cpp
+++ b/tests/unit_tests/blas/level2/symv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
-          int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
+         int lda) {
     // Prepare data.
     vector<fp> x, y, y_ref, A;
     rand_vector(x, n, incx);
@@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -105,7 +113,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta,
         good            = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SymvTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -113,22 +121,22 @@ class SymvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(SymvTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
 }
 TEST_P(SymvTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(SymvTestSuite, SymvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/symv_usm.cpp b/tests/unit_tests/blas/level2/symv_usm.cpp
new file mode 100644
index 000000000..b4569db53
--- /dev/null
+++ b/tests/unit_tests/blas/level2/symv_usm.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// USM SYMV test on `dev`: returns 1 if y matches the reference, 0 if not, or test_skipped.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy,
+         int lda) {
+    // Catch asynchronous exceptions: each SYCL exception is printed, not rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SYMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done; // assigned only when CALL_RT_API is defined
+    std::vector<event> dependencies; // never filled: passed to SYMV as an empty list
+
+    // Prepare data: USM shared vectors (alignment 64), presumably sized and filled by rand_*.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+    // Copy the initial y; the reference routine works on this copy.
+    auto y_ref = y;
+
+    // Call Reference SYMV.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+    // The reference wrapper takes sizes and scalars by pointer, and data as fp_ref pointers.
+    ::symv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(),
+           &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(),
+           &incy_ref);
+
+    // Call DPC++ SYMV, directly under CALL_RT_API or through the TEST_RUN_CT macro otherwise.
+    // NOTE(review): wait() does not invoke the async handler; wait_and_throw() would.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::symv(main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx,
+                                  beta, y.data(), incy, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::symv,
+                    (main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta,
+                     y.data(), incy, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SYMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend yields test_skipped rather than a comparison result.
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; execution continues to the comparison below.
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): also reached after a logged exception, when y may be unmodified.
+    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
+
+    return (int)good;
+}
+// Fixture for the USM SYMV cases, parameterized over cl::sycl::device.
+class SymvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case uses n = 30, lda = 42, both triangles, and increments (2, 3), (-2, -3), (1, 1).
+TEST_P(SymvUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0); // alpha and beta are passed unchanged to every call below
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+}
+TEST_P(SymvUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    double beta(3.0); // same argument sets as the single-precision case
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
+}
+// One instantiation per entry of the external `devices` vector; names come from ::DeviceNamePrint.
+INSTANTIATE_TEST_SUITE_P(SymvUsmTestSuite, SymvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr.cpp b/tests/unit_tests/blas/level2/syr.cpp
index 8d7618f9e..9bc5947f9 100644
--- a/tests/unit_tests/blas/level2/syr.cpp
+++ b/tests/unit_tests/blas/level2/syr.cpp
@@ -43,7 +43,7 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) {
     // Prepare data.
     vector<fp> x, A_ref, A;
     rand_vector(x, n, incx);
@@ -92,6 +92,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -99,28 +107,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SyrTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(SyrTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
 }
 TEST_P(SyrTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(SyrTestSuite, SyrTests, ::testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/blas/level2/syr2.cpp b/tests/unit_tests/blas/level2/syr2.cpp
index 3b704fb22..e0eba5a15 100644
--- a/tests/unit_tests/blas/level2/syr2.cpp
+++ b/tests/unit_tests/blas/level2/syr2.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
-          int lda) {
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
+         int lda) {
     // Prepare data.
     vector<fp> x, y, A_ref, A;
     rand_vector(x, n, incx);
@@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -104,28 +112,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx
         good            = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Syr2Tests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(Syr2Tests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
 }
 TEST_P(Syr2Tests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(Syr2TestSuite, Syr2Tests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/syr2_usm.cpp b/tests/unit_tests/blas/level2/syr2_usm.cpp
new file mode 100644
index 000000000..a356be237
--- /dev/null
+++ b/tests/unit_tests/blas/level2/syr2_usm.cpp
@@ -0,0 +1,142 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// USM SYR2 test on `dev`: returns 1 if A matches the reference, 0 if not, or test_skipped.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy,
+         int lda) {
+    // Catch asynchronous exceptions: each SYCL exception is printed, not rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SYR2:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done; // assigned only when CALL_RT_API is defined
+    std::vector<event> dependencies; // never filled: passed to SYR2 as an empty list
+
+    // Prepare data: USM shared vectors (alignment 64), presumably sized and filled by rand_*.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_vector(y, n, incy);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+    // Copy the initial A; the reference routine works on this copy.
+    auto A_ref = A;
+
+    // Call Reference SYR2.
+    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+    // The reference wrapper takes sizes and increments by pointer, and data as fp_ref pointers.
+    ::syr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ SYR2, directly under CALL_RT_API or through the TEST_RUN_CT macro otherwise.
+    // NOTE(review): wait() does not invoke the async handler; wait_and_throw() would.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::syr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy,
+                                  A.data(), lda, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::syr2,
+                    (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(),
+                     lda, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SYR2:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend yields test_skipped rather than a comparison result.
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; execution continues to the comparison below.
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): also reached after a logged exception, when A may be unmodified.
+    bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout);
+
+    return (int)good;
+}
+// Fixture for the USM SYR2 cases, parameterized over cl::sycl::device.
+class Syr2UsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each case uses n = 30, lda = 42, both triangles, and increments (2, 3), (-2, -3), (1, 1).
+TEST_P(Syr2UsmTests, RealSinglePrecision) {
+    float alpha(2.0); // scalar passed unchanged to every call below
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+}
+TEST_P(Syr2UsmTests, RealDoublePrecision) {
+    double alpha(2.0); // same argument sets as the single-precision case
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42));
+}
+// One instantiation per entry of the external `devices` vector; names come from ::DeviceNamePrint.
+INSTANTIATE_TEST_SUITE_P(Syr2UsmTestSuite, Syr2UsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr_usm.cpp b/tests/unit_tests/blas/level2/syr_usm.cpp
new file mode 100644
index 000000000..c32347f69
--- /dev/null
+++ b/tests/unit_tests/blas/level2/syr_usm.cpp
@@ -0,0 +1,140 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs SYR on `dev` through the USM API and checks A against the reference
+// ::syr result. Returns test_skipped when the backend-unsupported exception is
+// caught, otherwise the check_equal_matrix verdict converted to int.
+template <typename fp>
+int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) {
+    // Print asynchronous SYCL errors; they reach this handler via wait_and_throw().
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during SYR:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in shared USM allocations.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+    auto A_ref = A; // copy taken before either SYR runs
+
+    // Call Reference SYR.
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::syr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
+          &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
+
+    // Call DPC++ SYR; wait_and_throw() (unlike wait()) forwards async errors to the handler.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::syr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda,
+                                 dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(
+            main_queue, onemkl::blas::syr,
+            (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during SYR:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // A logged (non-skip) error above is still followed by this comparison.
+    bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout);
+    return (int)good;
+}
+
+class SyrUsmTests : public ::testing::TestWithParam<cl::sycl::device> {}; // gtest fixture whose parameter is a SYCL device
+
+TEST_P(SyrUsmTests, RealSinglePrecision) { // float: n = 30, lda = 42, incx in {2, -2, 1}, both triangles
+    float alpha(2.0); // same alpha for every case below
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
+}
+TEST_P(SyrUsmTests, RealDoublePrecision) { // double: n = 30, lda = 42, incx in {2, -2, 1}, both triangles
+    double alpha(2.0); // same alpha for every case below
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42));
+}
+
+INSTANTIATE_TEST_SUITE_P(SyrUsmTestSuite, SyrUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // instantiates the suite for every element of devices
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbmv.cpp b/tests/unit_tests/blas/level2/tbmv.cpp
index 89145cf5a..68b8f6e28 100644
--- a/tests/unit_tests/blas/level2/tbmv.cpp
+++ b/tests/unit_tests/blas/level2/tbmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -97,6 +97,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -104,118 +112,122 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TbmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TbmvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 TEST_P(TbmvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 TEST_P(TbmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
 }
 TEST_P(TbmvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(TbmvTestSuite, TbmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/tbmv_usm.cpp b/tests/unit_tests/blas/level2/tbmv_usm.cpp
new file mode 100644
index 000000000..c38ba20a9
--- /dev/null
+++ b/tests/unit_tests/blas/level2/tbmv_usm.cpp
@@ -0,0 +1,237 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs TBMV on `dev` through the USM API and checks x against the reference
+// ::tbmv result. Returns test_skipped when the backend-unsupported exception is
+// caught, otherwise the check_equal_vector verdict converted to int.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
+    // Print asynchronous SYCL errors; they reach this handler via wait_and_throw().
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TBMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in shared USM allocations.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, transa, n, n, lda);
+    auto x_ref = x; // copy taken before either TBMV runs (x is updated in place)
+
+    // Call Reference TBMV.
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    const int k_ref = k;
+    using fp_ref    = typename ref_type_info<fp>::type;
+
+    ::tbmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref, (fp_ref*)A.data(), &lda_ref,
+           (fp_ref*)x_ref.data(), &incx_ref);
+
+    // Call DPC++ TBMV; wait_and_throw() (unlike wait()) forwards async errors to the handler.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::tbmv(main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(),
+                                  lda, x.data(), incx, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::tbmv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), lda, x.data(),
+                     incx, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TBMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // A logged (non-skip) error above is still followed by this comparison.
+    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
+    return (int)good;
+}
+
+class TbmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {}; // gtest fixture whose parameter is a SYCL device
+
+TEST_P(TbmvUsmTests, RealSinglePrecision) { // float: n = 30, k = 5, incx = 2, lda = 42; uplo x {nontrans, trans} x diag
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+TEST_P(TbmvUsmTests, RealDoublePrecision) { // double: n = 30, k = 5, incx = 2, lda = 42; uplo x {nontrans, trans} x diag
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+TEST_P(TbmvUsmTests, ComplexSinglePrecision) { // complex<float>: n = 30, k = 5, incx = 2, lda = 42; uplo x {nontrans, trans, conjtrans} x diag
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+}
+TEST_P(TbmvUsmTests, ComplexDoublePrecision) { // 12 cases: every uplo x transpose x diag combo
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+
+INSTANTIATE_TEST_SUITE_P(TbmvUsmTestSuite, TbmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // one instantiation per entry in devices
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbsv.cpp b/tests/unit_tests/blas/level2/tbsv.cpp
index 1a09c8dbf..9f3a3b68f 100644
--- a/tests/unit_tests/blas/level2/tbsv.cpp
+++ b/tests/unit_tests/blas/level2/tbsv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -97,6 +97,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -104,118 +112,122 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TbsvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TbsvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 TEST_P(TbsvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 TEST_P(TbsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          5, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
 }
 TEST_P(TbsvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(TbsvTestSuite, TbsvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/tbsv_usm.cpp b/tests/unit_tests/blas/level2/tbsv_usm.cpp
new file mode 100644
index 000000000..b01a03ee8
--- /dev/null
+++ b/tests/unit_tests/blas/level2/tbsv_usm.cpp
@@ -0,0 +1,237 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int k, int incx, int lda) {
+    // Returns test_skipped if the backend is unsupported, else the comparison result as int.
+    auto exception_handler = [](exception_list exceptions) { // logs asynchronous SYCL exceptions
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TBSV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+    // NOTE(review): plain wait() below may not invoke this handler; wait_and_throw() would.
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: x and A use a usm_allocator<fp, usm::alloc::shared, 64> bound to cxt and dev.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_trsm_matrix(A, transa, n, n, lda);
+
+    auto x_ref = x;
+
+    // Call Reference TBSV on the x_ref copy (scalar arguments are passed by pointer).
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    const int k_ref = k;
+    using fp_ref    = typename ref_type_info<fp>::type;
+
+    ::tbsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref, (fp_ref*)A.data(), &lda_ref,
+           (fp_ref*)x_ref.data(), &incx_ref);
+
+    // Call DPC++ TBSV.
+    // CALL_RT_API: call onemkl::blas::tbsv directly; otherwise go through TEST_RUN_CT.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::tbsv(main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(),
+                                  lda, x.data(), incx, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::tbsv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), lda, x.data(),
+                     incx, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TBSV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: return the skip code without comparing results.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; the comparison below still runs.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
+
+    return (int)good;
+}
+
+class TbsvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {}; // param: SYCL device
+
+TEST_P(TbsvUsmTests, RealSinglePrecision) { // 8 cases: every uplo x {nontrans, trans} x diag
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+TEST_P(TbsvUsmTests, RealDoublePrecision) { // 8 cases: every uplo x {nontrans, trans} x diag
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+TEST_P(TbsvUsmTests, ComplexSinglePrecision) { // 12 cases: every uplo x transpose x diag combo
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 30, 5,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 5, 2, 42));
+}
+TEST_P(TbsvUsmTests, ComplexDoublePrecision) { // complex<double>: all 12 uplo/transpose/diag combos
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::unit, 30,
+                                                 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 5, 2, 42));
+}
+
+INSTANTIATE_TEST_SUITE_P(TbsvUsmTestSuite, TbsvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // one instance per entry in `devices`
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpmv.cpp b/tests/unit_tests/blas/level2/tpmv.cpp
index 678c837b3..c85dacf0d 100644
--- a/tests/unit_tests/blas/level2/tpmv.cpp
+++ b/tests/unit_tests/blas/level2/tpmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int incx) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -95,6 +95,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -102,106 +110,106 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TpmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TpmvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
 }
 TEST_P(TpmvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
 }
 TEST_P(TpmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
 }
 TEST_P(TpmvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
 }
 
 INSTANTIATE_TEST_SUITE_P(TpmvTestSuite, TpmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/tpmv_usm.cpp b/tests/unit_tests/blas/level2/tpmv_usm.cpp
new file mode 100644
index 000000000..e208d28f2
--- /dev/null
+++ b/tests/unit_tests/blas/level2/tpmv_usm.cpp
@@ -0,0 +1,220 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM onemkl::blas::tpmv on `dev` and compares x with the reference ::tpmv.
+// x and A are filled by rand_vector(x, n, incx) and rand_matrix(A, transa, n, n, n).
+// Returns test_skipped if the call throws onemkl::backend_unsupported_exception,
+// otherwise the check_equal_vector() result converted to int. Any other caught
+// exception is only printed; the comparison below still runs afterwards.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TPMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, transa, n, n, n);
+    // x_ref is the copy handed to the reference ::tpmv and later compared against x.
+    auto x_ref = x;
+
+    // Call Reference TPMV.
+    const int n_ref = n, incx_ref = incx;
+    using fp_ref = typename ref_type_info<fp>::type;
+    ::tpmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), (fp_ref*)x_ref.data(),
+           &incx_ref);
+
+    // Call DPC++ TPMV.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::tpmv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(),
+                                  x.data(), incx, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::tpmv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), x.data(), incx,
+                     dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TPMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
+
+    return static_cast<int>(good);
+}
+// Fixture: each TEST_P below obtains one cl::sycl::device through GetParam().
+class TpmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(TpmvUsmTests, RealSinglePrecision) { // float: 8 uplo/trans/diag combos, n=30, incx=2
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+}
+TEST_P(TpmvUsmTests, RealDoublePrecision) { // double: 8 uplo/trans/diag combos, n=30, incx=2
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+}
+TEST_P(TpmvUsmTests, ComplexSinglePrecision) { // complex<float>: 12 combos, n=30, incx=2
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+}
+TEST_P(TpmvUsmTests, ComplexDoublePrecision) { // complex<double>: 12 combos, n=30, incx=2
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+}
+
+INSTANTIATE_TEST_SUITE_P(TpmvUsmTestSuite, TpmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // one instance per entry in `devices`
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpsv.cpp b/tests/unit_tests/blas/level2/tpsv.cpp
index 4e18af350..201b53471 100644
--- a/tests/unit_tests/blas/level2/tpsv.cpp
+++ b/tests/unit_tests/blas/level2/tpsv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int incx) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -95,6 +95,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped; // unsupported backend: hand back the skip code, not a pass/fail
+    }
+    // Any other runtime_error is only logged; the result comparison below still runs.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -102,106 +110,106 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TpsvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TpsvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
 }
 TEST_P(TpsvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
 }
 TEST_P(TpsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
 }
 TEST_P(TpsvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
 }
 
 INSTANTIATE_TEST_SUITE_P(TpsvTestSuite, TpsvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/tpsv_usm.cpp b/tests/unit_tests/blas/level2/tpsv_usm.cpp
new file mode 100644
index 000000000..cb218dc99
--- /dev/null
+++ b/tests/unit_tests/blas/level2/tpsv_usm.cpp
@@ -0,0 +1,220 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// Solves with USM onemkl::blas::tpsv on `dev` and checks x against the reference ::tpsv.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx) {
+    // Handler for asynchronous SYCL exceptions: each one is logged, none is rethrown.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TPSV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+    // Queue on the device under test; its context feeds the USM allocator below.
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in USM shared allocations (64 is the allocator's alignment argument).
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_trsm_matrix(A, transa, n, n, n);
+    // Keep a copy of the input: the reference routine gets x_ref, the device routine gets x.
+    auto x_ref = x;
+
+    // Call reference TPSV (n/incx passed by pointer, buffers cast to the reference type).
+    const int n_ref = n, incx_ref = incx;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::tpsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), (fp_ref*)x_ref.data(),
+           &incx_ref);
+
+    // Call DPC++ TPSV. With CALL_RT_API the call is made directly and its event is waited on;
+    // otherwise it is dispatched through TEST_RUN_CT and the whole queue is waited on.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::tpsv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(),
+                                  x.data(), incx, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::tpsv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), x.data(), incx,
+                     dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TPSV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // backend_unsupported_exception: return test_skipped instead of comparing results.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Any other runtime_error is only logged; the comparison below still runs.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // The USM vectors are handed to the checker directly.
+    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
+    // Returned as int so that the bool verdict and test_skipped share one return type.
+    return (int)good;
+}
+// Fixture parameterised on a SYCL device; each TEST_P passes GetParam() to test() as `dev`.
+class TpsvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(TpsvUsmTests, RealSinglePrecision) { // float; all cases use n = 30, incx = 2
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2));
+}
+TEST_P(TpsvUsmTests, RealDoublePrecision) { // double; all cases use n = 30, incx = 2
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2));
+}
+TEST_P(TpsvUsmTests, ComplexSinglePrecision) { // complex<float>; adds conjtrans; n = 30, incx = 2
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2));
+}
+TEST_P(TpsvUsmTests, ComplexDoublePrecision) { // complex<double>; adds conjtrans; n = 30, incx = 2
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2));
+}
+// Run TpsvUsmTests once per device in `devices`; DeviceNamePrint supplies each instance name.
+INSTANTIATE_TEST_SUITE_P(TpsvUsmTestSuite, TpsvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trmv.cpp b/tests/unit_tests/blas/level2/trmv.cpp
index e64b14155..5678da343 100644
--- a/tests/unit_tests/blas/level2/trmv.cpp
+++ b/tests/unit_tests/blas/level2/trmv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int incx, int lda) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx, int lda) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -96,6 +96,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped; // unsupported backend: hand back the skip code, not a pass/fail
+    }
+    // Any other runtime_error is only logged; the result comparison below still runs.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -103,116 +111,118 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TrmvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TrmvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2, 42)); // n=30, incx=2, lda=42
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2, 42)); // from here on: diag::nonunit
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
 }
 TEST_P(TrmvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2, 42)); // n=30, incx=2, lda=42
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2, 42)); // from here on: diag::nonunit
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
 }
 TEST_P(TrmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                2, 42)); // 30, 2, 42 = n, incx, lda
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2, 42)); // from here on: diag::nonunit
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
 }
 TEST_P(TrmvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 2, 42)); // n=30, incx=2, lda=42
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2, 42)); // from here on: diag::nonunit
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2, 42));
 }
 
 INSTANTIATE_TEST_SUITE_P(TrmvTestSuite, TrmvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/trmv_usm.cpp b/tests/unit_tests/blas/level2/trmv_usm.cpp
new file mode 100644
index 000000000..909536b56
--- /dev/null
+++ b/tests/unit_tests/blas/level2/trmv_usm.cpp
@@ -0,0 +1,232 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// Runs USM TRMV on dev and compares x with the reference BLAS; returns 1/0 or test_skipped.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx, int lda) {
+    // Handler for asynchronous SYCL exceptions: rethrows each stored exception and prints it.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TRMV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done; // assigned only in the CALL_RT_API branch below
+    std::vector<event> dependencies; // stays empty; passed to trmv as its dependency list
+
+    // Prepare data: x and A are vectors backed by the shared-USM allocator ua.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_matrix(A, transa, n, n, lda);
+
+    auto x_ref = x; // copy of the random input, handed to the reference call
+
+    // Call Reference TRMV.
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::trmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), &lda_ref,
+           (fp_ref*)x_ref.data(), &incx_ref);
+
+    // Call DPC++ TRMV.
+    // With CALL_RT_API the returned event is waited on; otherwise the whole queue is.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::trmv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda,
+                                  x.data(), incx, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::trmv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, x.data(),
+                     incx, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TRMV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // An unsupported backend is reported to the caller as test_skipped.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only printed; execution continues with the comparison.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
+
+    return (int)good; // 1 if check_equal_vector returned true, otherwise 0
+}
+// Test fixture parameterized on the SYCL device; each case passes GetParam() to test() as dev.
+class TrmvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(TrmvUsmTests, RealSinglePrecision) { // float: 8 uplo/trans/diag cases, n=30 incx=2 lda=42
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                  onemkl::diag::nonunit, 30, 2, 42));
+}
+TEST_P(TrmvUsmTests, RealDoublePrecision) { // double: 8 uplo/trans/diag cases, n=30 incx=2 lda=42
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
+                                   onemkl::diag::nonunit, 30, 2, 42));
+}
+TEST_P(TrmvUsmTests, ComplexSinglePrecision) { // 12 uplo/trans/diag cases, n=30 incx=2 lda=42
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 30,
+                                                2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                30, 2, 42));
+}
+TEST_P(TrmvUsmTests, ComplexDoublePrecision) { // 12 uplo/trans/diag cases, n=30 incx=2 lda=42
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, onemkl::diag::nonunit,
+                                                 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2, 42));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 30, 2, 42));
+}
+
+INSTANTIATE_TEST_SUITE_P(TrmvUsmTestSuite, TrmvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // one instance per entry of devices
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trsv.cpp b/tests/unit_tests/blas/level2/trsv.cpp
index 42daa93d9..1982bd5f4 100644
--- a/tests/unit_tests/blas/level2/trsv.cpp
+++ b/tests/unit_tests/blas/level2/trsv.cpp
@@ -43,8 +43,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
-          onemkl::diag unit_nonunit, int n, int incx, int lda) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx, int lda) {
     // Prepare data.
     vector<fp> x, x_ref, A;
     rand_vector(x, n, incx);
@@ -96,6 +96,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -103,116 +111,118 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
         good            = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TrsvTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TrsvTests, RealSinglePrecision) {
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                            onemkl::diag::nonunit, 30, 2, 42));
+    // Sweep diag x transpose x uplo for float with n = 30, incx = 2, lda = 42.
+    // diag is the slowest-varying loop and uplo the fastest, so the cases run as
+    // unit/nonunit blocks of (nontrans, trans), each as lower then upper.
+    using fp = float;
+    const int n = 30, incx = 2, lda = 42;
+
+    for (onemkl::diag unit_nonunit : { onemkl::diag::unit, onemkl::diag::nonunit }) {
+        for (onemkl::transpose transa :
+             { onemkl::transpose::nontrans, onemkl::transpose::trans }) {
+            for (onemkl::uplo upper_lower : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+                // Each combination gets its own expectation.
+                EXPECT_TRUEORSKIP(
+                    test<fp>(GetParam(), upper_lower, transa, unit_nonunit, n, incx, lda));
+            }
+        }
+    }
 }
 TEST_P(TrsvTests, RealDoublePrecision) {
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                             onemkl::diag::nonunit, 30, 2, 42));
+    // Sweep diag x transpose x uplo for double with n = 30, incx = 2, lda = 42.
+    // diag is the slowest-varying loop and uplo the fastest, so the cases run as
+    // unit/nonunit blocks of (nontrans, trans), each as lower then upper.
+    using fp = double;
+    const int n = 30, incx = 2, lda = 42;
+
+    for (onemkl::diag unit_nonunit : { onemkl::diag::unit, onemkl::diag::nonunit }) {
+        for (onemkl::transpose transa :
+             { onemkl::transpose::nontrans, onemkl::transpose::trans }) {
+            for (onemkl::uplo upper_lower : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+                // Each combination gets its own expectation.
+                EXPECT_TRUEORSKIP(
+                    test<fp>(GetParam(), upper_lower, transa, unit_nonunit, n, incx, lda));
+            }
+        }
+    }
 }
 TEST_P(TrsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2,
-                                          42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          onemkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2, 42));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                          2, 42));
+    // Every (uplo, transpose, diag) combination exercised for std::complex<float>,
+    // conjtrans included. Rows are grouped by diag and executed top to bottom, so
+    // the lower/upper pair of each transpose runs back to back.
+    using fp = std::complex<float>;
+    struct trsv_case {
+        onemkl::uplo upper_lower;
+        onemkl::transpose transa;
+        onemkl::diag unit_nonunit;
+    };
+
+    const trsv_case cases[] = {
+        // Unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        // Non-unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+    };
+    // n, incx and lda are the same for every row.
+    const int n = 30, incx = 2, lda = 42;
+
+    for (const trsv_case& c : cases) {
+        // Each row gets its own expectation.
+        EXPECT_TRUEORSKIP(
+            test<fp>(GetParam(), c.upper_lower, c.transa, c.unit_nonunit, n, incx, lda));
+    }
 }
 TEST_P(TrsvTests, ComplexDoublePrecision) {
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(
-        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(
-        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2,
-                                           42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30,
-                                           2, 42));
+    // Every (uplo, transpose, diag) combination exercised for std::complex<double>,
+    // conjtrans included. Rows are grouped by diag and executed top to bottom, so
+    // the lower/upper pair of each transpose runs back to back.
+    using fp = std::complex<double>;
+    struct trsv_case {
+        onemkl::uplo upper_lower;
+        onemkl::transpose transa;
+        onemkl::diag unit_nonunit;
+    };
+
+    const trsv_case cases[] = {
+        // Unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        // Non-unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+    };
+    // n, incx and lda are the same for every row.
+    const int n = 30, incx = 2, lda = 42;
+
+    for (const trsv_case& c : cases) {
+        // Each row gets its own expectation.
+        EXPECT_TRUEORSKIP(
+            test<fp>(GetParam(), c.upper_lower, c.transa, c.unit_nonunit, n, incx, lda));
+    }
 }
 
 INSTANTIATE_TEST_SUITE_P(TrsvTestSuite, TrsvTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level2/trsv_usm.cpp b/tests/unit_tests/blas/level2/trsv_usm.cpp
new file mode 100644
index 000000000..1c76f541d
--- /dev/null
+++ b/tests/unit_tests/blas/level2/trsv_usm.cpp
@@ -0,0 +1,232 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs the USM TRSV routine on `dev` and checks it against the reference ::trsv
+// applied to a copy of the same random input. Returns test_skipped when the
+// backend throws backend_unsupported_exception; otherwise returns the outcome of
+// the vector comparison converted to int (1 on match, 0 on mismatch).
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa,
+         onemkl::diag unit_nonunit, int n, int incx, int lda) {
+    // Catch asynchronous exceptions: log each one instead of letting it propagate.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TRSV:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in 64-byte-aligned USM shared allocations.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> x(ua), A(ua);
+    rand_vector(x, n, incx);
+    rand_trsm_matrix(A, transa, n, n, lda);
+
+    // Call Reference TRSV on a copy of x; x_ref is what the comparison below expects.
+    auto x_ref = x;
+    const int n_ref = n, incx_ref = incx, lda_ref = lda;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::trsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
+           convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), &lda_ref,
+           (fp_ref*)x_ref.data(), &incx_ref);
+
+    // Call DPC++ TRSV. wait_and_throw() (rather than wait()) hands any pending
+    // asynchronous errors to exception_handler before the results are compared.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::trsv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda,
+                                  x.data(), incx, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::trsv,
+                    (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, x.data(),
+                     incx, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TRSV:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // Errors logged above do not abort the test; the comparison still runs.
+    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
+    return (int)good;
+}
+
+// Fixture: each test is parameterized by the cl::sycl::device it runs on.
+class TrsvUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+TEST_P(TrsvUsmTests, RealSinglePrecision) {
+    // Sweep diag x transpose x uplo for float with n = 30, incx = 2, lda = 42.
+    // diag is the slowest-varying loop and uplo the fastest, so the cases run as
+    // unit/nonunit blocks of (nontrans, trans), each as lower then upper.
+    using fp = float;
+    const int n = 30, incx = 2, lda = 42;
+
+    for (onemkl::diag unit_nonunit : { onemkl::diag::unit, onemkl::diag::nonunit }) {
+        for (onemkl::transpose transa :
+             { onemkl::transpose::nontrans, onemkl::transpose::trans }) {
+            for (onemkl::uplo upper_lower : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+                // Each combination gets its own expectation.
+                EXPECT_TRUEORSKIP(
+                    test<fp>(GetParam(), upper_lower, transa, unit_nonunit, n, incx, lda));
+            }
+        }
+    }
+}
+TEST_P(TrsvUsmTests, RealDoublePrecision) {
+    // Sweep diag x transpose x uplo for double with n = 30, incx = 2, lda = 42.
+    // diag is the slowest-varying loop and uplo the fastest, so the cases run as
+    // unit/nonunit blocks of (nontrans, trans), each as lower then upper.
+    using fp = double;
+    const int n = 30, incx = 2, lda = 42;
+
+    for (onemkl::diag unit_nonunit : { onemkl::diag::unit, onemkl::diag::nonunit }) {
+        for (onemkl::transpose transa :
+             { onemkl::transpose::nontrans, onemkl::transpose::trans }) {
+            for (onemkl::uplo upper_lower : { onemkl::uplo::lower, onemkl::uplo::upper }) {
+                // Each combination gets its own expectation.
+                EXPECT_TRUEORSKIP(
+                    test<fp>(GetParam(), upper_lower, transa, unit_nonunit, n, incx, lda));
+            }
+        }
+    }
+}
+TEST_P(TrsvUsmTests, ComplexSinglePrecision) {
+    // Every (uplo, transpose, diag) combination exercised for std::complex<float>,
+    // conjtrans included. Rows are grouped by diag and executed top to bottom, so
+    // the lower/upper pair of each transpose runs back to back.
+    using fp = std::complex<float>;
+    struct trsv_case {
+        onemkl::uplo upper_lower;
+        onemkl::transpose transa;
+        onemkl::diag unit_nonunit;
+    };
+
+    const trsv_case cases[] = {
+        // Unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        // Non-unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+    };
+    // n, incx and lda are the same for every row.
+    const int n = 30, incx = 2, lda = 42;
+
+    for (const trsv_case& c : cases) {
+        // Each row gets its own expectation.
+        EXPECT_TRUEORSKIP(
+            test<fp>(GetParam(), c.upper_lower, c.transa, c.unit_nonunit, n, incx, lda));
+    }
+}
+TEST_P(TrsvUsmTests, ComplexDoublePrecision) {
+    // Every (uplo, transpose, diag) combination exercised for std::complex<double>,
+    // conjtrans included. Rows are grouped by diag and executed top to bottom, so
+    // the lower/upper pair of each transpose runs back to back.
+    using fp = std::complex<double>;
+    struct trsv_case {
+        onemkl::uplo upper_lower;
+        onemkl::transpose transa;
+        onemkl::diag unit_nonunit;
+    };
+
+    const trsv_case cases[] = {
+        // Unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit },
+        // Non-unit diagonal.
+        { onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit },
+        { onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+        { onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::nonunit },
+    };
+    // n, incx and lda are the same for every row.
+    const int n = 30, incx = 2, lda = 42;
+
+    for (const trsv_case& c : cases) {
+        // Each row gets its own expectation.
+        EXPECT_TRUEORSKIP(
+            test<fp>(GetParam(), c.upper_lower, c.transa, c.unit_nonunit, n, incx, lda));
+    }
+}
+
+// Run the suite once per entry in `devices`, naming instances via ::DeviceNamePrint.
+INSTANTIATE_TEST_SUITE_P(TrsvUsmTestSuite, TrsvUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/CMakeLists.txt b/tests/unit_tests/blas/level3/CMakeLists.txt
index 6845dd95f..a00b64328 100644
--- a/tests/unit_tests/blas/level3/CMakeLists.txt
+++ b/tests/unit_tests/blas/level3/CMakeLists.txt
@@ -18,7 +18,28 @@
 #===============================================================================
 
 # Build object from all test sources
-set(L3_SOURCES "gemm.cpp" "symm.cpp" "syrk.cpp" "hemm.cpp" "herk.cpp" "syr2k.cpp" "her2k.cpp" "trmm.cpp" "trsm.cpp")
+set(L3_SOURCES
+  # Non-USM tests
+  "gemm.cpp"
+  "symm.cpp"
+  "syrk.cpp"
+  "hemm.cpp"
+  "herk.cpp"
+  "syr2k.cpp"
+  "her2k.cpp"
+  "trmm.cpp"
+  "trsm.cpp"
+  # USM counterparts, listed in the same order
+  "gemm_usm.cpp"
+  "symm_usm.cpp"
+  "syrk_usm.cpp"
+  "hemm_usm.cpp"
+  "herk_usm.cpp"
+  "syr2k_usm.cpp"
+  "her2k_usm.cpp"
+  "trmm_usm.cpp"
+  "trsm_usm.cpp"
+)
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_level3_rt OBJECT ${L3_SOURCES})
diff --git a/tests/unit_tests/blas/level3/gemm.cpp b/tests/unit_tests/blas/level3/gemm.cpp
index 0693b33f5..51124cf4c 100644
--- a/tests/unit_tests/blas/level3/gemm.cpp
+++ b/tests/unit_tests/blas/level3/gemm.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n,
-          int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     rand_matrix(A, transa, m, k, lda);
@@ -101,11 +101,19 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     auto C_accessor = C_buffer.template get_access<access::mode::read>();
     bool good       = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout);
 
-    return good;
+    return (int)good;
 }
 
 class GemmTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -113,91 +121,95 @@ class GemmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(GemmTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                            79, 83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79,
-                            83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79,
-                            83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83,
-                            91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                  beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
 }
 
 TEST_P(GemmTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans,
-                             79, 83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79,
-                             83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79,
-                             83, 91, 103, 105, 106, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83,
-                             91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
+                                   79, 83, 91, 103, 105, 106, alpha, beta));
 }
 
 TEST_P(GemmTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
-                                          onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
-                                          onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
-                                          onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
-                                          onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
-                                          onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
-                                          onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
-                                          onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
-                                          onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
-                                          onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                          alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
 }
 
 TEST_P(GemmTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
-                                           onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
-                                           onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
-                                           onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
-                                           onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
-                                           onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
-                                           onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
-                                           onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
-                                           onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
-                                           onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106,
-                                           alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
 }
 
 INSTANTIATE_TEST_SUITE_P(GemmTestSuite, GemmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/gemm_usm.cpp b/tests/unit_tests/blas/level3/gemm_usm.cpp
new file mode 100644
index 000000000..6753fe1f3
--- /dev/null
+++ b/tests/unit_tests/blas/level3/gemm_usm.cpp
@@ -0,0 +1,220 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// Runs the USM onemkl::blas::gemm on dev and checks C against the reference ::gemm result.
+template <typename fp>
+int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
+    // Handler for asynchronous errors on the queue: prints each SYCL exception it is handed.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: A, B and C are vectors backed by a shared-USM allocator for this context.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    rand_matrix(A, transa, m, k, lda);
+    rand_matrix(B, transb, k, n, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc);
+    // Copy of the initial C; the reference GEMM below receives this copy, not C itself.
+    auto C_ref = C;
+
+    // Call Reference GEMM.
+    const int m_ref = m, n_ref = n, k_ref = k;
+    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::gemm(convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), &m_ref, &n_ref, &k_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
+           (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ GEMM.
+    // CALL_RT_API path waits on the returned event; the TEST_RUN_CT path waits on the queue.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::gemm(main_queue, transa, transb, m, n, k, alpha, A.data(), lda,
+                                  B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::gemm,
+                    (main_queue, transa, transb, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta,
+                     C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during GEMM:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: hand test_skipped back to the caller.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Any other std::runtime_error is only printed; execution continues with the comparison.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): 10 * k is presumably the error-bound scale check_equal_matrix expects; confirm.
+    bool good = check_equal_matrix(C, C_ref, m, n, ldc, 10 * k, std::cout);
+    // 1 if check_equal_matrix accepted C, 0 otherwise.
+    return (int)good;
+}
+// Test fixture parameterized by the SYCL device that GetParam() hands to test().
+class GemmUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// float: the four nontrans/trans pairings of transa and transb with m=79, n=83, k=91.
+TEST_P(GemmUsmTests, RealSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans,
+                                  onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                  beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
+                                  79, 83, 91, 103, 105, 106, alpha, beta));
+}
+// double: the four nontrans/trans pairings of transa and transb with m=79, n=83, k=91.
+TEST_P(GemmUsmTests, RealDoublePrecision) {
+    double alpha(2.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans,
+                                   onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::nontrans,
+                                   onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::trans,
+                                   onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha,
+                                   beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans,
+                                   79, 83, 91, 103, 105, 106, alpha, beta));
+}
+// std::complex<float>: all nine nontrans/trans/conjtrans pairings with m=79, n=83, k=91.
+TEST_P(GemmUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::nontrans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::trans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::trans, 79, 83, 91, 103, 105, 106,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::transpose::conjtrans,
+                                                onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                106, alpha, beta));
+}
+// std::complex<double>: all nine nontrans/trans/conjtrans pairings with m=79, n=83, k=91.
+TEST_P(GemmUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::nontrans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::trans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::nontrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::trans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::transpose::conjtrans,
+                                                 onemkl::transpose::conjtrans, 79, 83, 91, 103, 105,
+                                                 106, alpha, beta));
+}
+// Instantiates GemmUsmTests for each device in the global devices vector.
+INSTANTIATE_TEST_SUITE_P(GemmUsmTestSuite, GemmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/hemm.cpp b/tests/unit_tests/blas/level3/hemm.cpp
index 4359d9758..15b25ee4a 100644
--- a/tests/unit_tests/blas/level3/hemm.cpp
+++ b/tests/unit_tests/blas/level3/hemm.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
-          int lda, int ldb, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     if (left_right == onemkl::side::left)
@@ -104,13 +104,21 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HEMM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
         auto C_accessor = C_buffer.template get_access<access::mode::read>();
         good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout);
     }
-    return good;
+    return (int)good;
 }
 
 class HemmTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -118,26 +126,26 @@ class HemmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(HemmTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72,
-                                          27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
 }
 TEST_P(HemmTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72,
-                                           27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
 }
 
 INSTANTIATE_TEST_SUITE_P(HemmTestSuite, HemmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/hemm_usm.cpp b/tests/unit_tests/blas/level3/hemm_usm.cpp
new file mode 100644
index 000000000..39dece022
--- /dev/null
+++ b/tests/unit_tests/blas/level3/hemm_usm.cpp
@@ -0,0 +1,154 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+// Runs the USM onemkl::blas::hemm on dev and checks C against the reference ::hemm result.
+template <typename fp>
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
+    // Handler for asynchronous errors on the queue: prints each SYCL exception it is handed.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during HEMM:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: A is generated as m-by-m for side::left, n-by-n otherwise; B and C as m-by-n.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    if (left_right == onemkl::side::left)
+        rand_matrix(A, onemkl::transpose::nontrans, m, m, lda);
+    else
+        rand_matrix(A, onemkl::transpose::nontrans, n, n, lda);
+    rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc);
+    // Copy of the initial C; the reference HEMM below receives this copy, not C itself.
+    auto C_ref = C;
+
+    // Call Reference HEMM.
+    const int m_ref = m, n_ref = n;
+    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::hemm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
+           (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ HEMM.
+    // CALL_RT_API path waits on the returned event; the TEST_RUN_CT path waits on the queue.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::hemm(main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda,
+                                  B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::hemm,
+                    (main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb,
+                     beta, C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during HEMM:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: hand test_skipped back to the caller.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Any other std::runtime_error is only printed; execution continues with the comparison.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HEMM:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): 10 * max(m, n) is presumably check_equal_matrix's error-bound scale; confirm.
+    bool good = check_equal_matrix(C, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout);
+    // 1 if check_equal_matrix accepted C, 0 otherwise.
+    return (int)good;
+}
+// Test fixture parameterized by the SYCL device that GetParam() hands to test().
+class HemmUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// std::complex<float>: every side/uplo pairing (left/right x lower/upper) with m=72, n=27.
+TEST_P(HemmUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+}
+TEST_P(HemmUsmTests, ComplexDoublePrecision) {  // std::complex<double>, every side/uplo pairing
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+}
+// Instantiates HemmUsmTests for each device in the global devices vector.
+INSTANTIATE_TEST_SUITE_P(HemmUsmTestSuite, HemmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/her2k.cpp b/tests/unit_tests/blas/level3/her2k.cpp
index 202ec860e..e7a208a9c 100644
--- a/tests/unit_tests/blas/level3/her2k.cpp
+++ b/tests/unit_tests/blas/level3/her2k.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
-          int lda, int ldb, int ldc, fp alpha, fp_scalar beta) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp_scalar beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     rand_matrix(A, trans, n, k, lda);
@@ -102,6 +102,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -109,7 +117,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
         good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Her2kTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -117,34 +125,34 @@ class Her2kTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(Her2kTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     float beta(1.0);
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
-                                                  onemkl::transpose::nontrans, 72, 27, 101, 102,
-                                                  103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
-                                                  onemkl::transpose::nontrans, 72, 27, 101, 102,
-                                                  103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
-                                                  onemkl::transpose::conjtrans, 72, 27, 101, 102,
-                                                  103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
-                                                  onemkl::transpose::conjtrans, 72, 27, 101, 102,
-                                                  103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
 }
 TEST_P(Her2kTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     double beta(1.0);
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
-                                                    onemkl::transpose::nontrans, 72, 27, 101, 102,
-                                                    103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
-                                                    onemkl::transpose::nontrans, 72, 27, 101, 102,
-                                                    103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
-                                                    onemkl::transpose::conjtrans, 72, 27, 101, 102,
-                                                    103, alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
-                                                    onemkl::transpose::conjtrans, 72, 27, 101, 102,
-                                                    103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
 }
 
 INSTANTIATE_TEST_SUITE_P(Her2kTestSuite, Her2kTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/her2k_usm.cpp b/tests/unit_tests/blas/level3/her2k_usm.cpp
new file mode 100644
index 000000000..7f66a777a
--- /dev/null
+++ b/tests/unit_tests/blas/level3/her2k_usm.cpp
@@ -0,0 +1,160 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp, typename fp_scalar>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp_scalar beta) {
+    // Verify USM her2k on dev against the reference ::her2k. First: log asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during HER2K:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+    // Queue carries the handler above; its context is reused for the USM allocator below.
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: A, B, C use a shared-USM allocator (64 is its alignment template argument).
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    rand_matrix(A, trans, n, k, lda);
+    rand_matrix(B, trans, n, k, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc);
+    // Snapshot C so the reference call below works on its own copy of the same initial values.
+    auto C_ref = C;
+
+    // Call Reference HER2K. Sizes are copied to const ints because ::her2k takes them by address.
+    const int n_ref = n, k_ref = k;
+    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
+
+    using fp_ref        = typename ref_type_info<fp>::type;
+    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
+    // NOTE(review): fp_ref / fp_scalar_mkl are presumably the reference library's types - confirm.
+    ::her2k(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref,
+            (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
+            (fp_scalar_mkl*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ HER2K: onemkl::blas::her2k directly under CALL_RT_API, else via TEST_RUN_CT.
+    // Both paths wait for completion before C is compared on the host.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::her2k(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda,
+                                   B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::her2k,
+                    (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, B.data(), ldb,
+                     beta, C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during HER2K:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: report a skip rather than a failure.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; control still reaches the comparison below.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): 10 * max(n, k) is presumably the allowed-error scale - confirm in the helper.
+    bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
+    // Returns 1 when the matrices matched and 0 otherwise.
+    return (int)good;
+}
+
+class Her2kUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Fixture is parameterized by device; each TEST_P sweeps uplo {lower,upper} x trans for one type.
+TEST_P(Her2kUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(2.0, -0.5);
+    float beta(1.0);  // calls below pass n=72, k=27, lda=101, ldb=102, ldc=103
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        102, 103, alpha, beta)));
+}
+TEST_P(Her2kUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(2.0, -0.5);
+    double beta(1.0);  // calls below pass n=72, k=27, lda=101, ldb=102, ldc=103
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          102, 103, alpha, beta)));
+}
+// One instantiation per element of `devices`; ::DeviceNamePrint() is passed as the name generator.
+INSTANTIATE_TEST_SUITE_P(Her2kUsmTestSuite, Her2kUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/herk.cpp b/tests/unit_tests/blas/level3/herk.cpp
index 6e414156c..143589bc4 100644
--- a/tests/unit_tests/blas/level3/herk.cpp
+++ b/tests/unit_tests/blas/level3/herk.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp, typename fp_scalar>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
-          int lda, int ldc, fp_scalar alpha, fp_scalar beta) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldc, fp_scalar alpha, fp_scalar beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, C, C_ref;
     rand_matrix(A, trans, n, k, lda);
@@ -99,6 +99,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -106,7 +114,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
         good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class HerkTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -114,34 +122,34 @@ class HerkTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(HerkTests, ComplexSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
-                                                  onemkl::transpose::nontrans, 72, 27, 101, 103,
-                                                  alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
-                                                  onemkl::transpose::nontrans, 72, 27, 101, 103,
-                                                  alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
-                                                  onemkl::transpose::conjtrans, 72, 27, 101, 103,
-                                                  alpha, beta)));
-    EXPECT_TRUE((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
-                                                  onemkl::transpose::conjtrans, 72, 27, 101, 103,
-                                                  alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        103, alpha, beta)));
 }
 TEST_P(HerkTests, ComplexDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
-                                                    onemkl::transpose::nontrans, 72, 27, 101, 103,
-                                                    alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
-                                                    onemkl::transpose::nontrans, 72, 27, 101, 103,
-                                                    alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
-                                                    onemkl::transpose::conjtrans, 72, 27, 101, 103,
-                                                    alpha, beta)));
-    EXPECT_TRUE((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
-                                                    onemkl::transpose::conjtrans, 72, 27, 101, 103,
-                                                    alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          103, alpha, beta)));
 }
 
 INSTANTIATE_TEST_SUITE_P(HerkTestSuite, HerkTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/herk_usm.cpp b/tests/unit_tests/blas/level3/herk_usm.cpp
new file mode 100644
index 000000000..82d42140e
--- /dev/null
+++ b/tests/unit_tests/blas/level3/herk_usm.cpp
@@ -0,0 +1,158 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp, typename fp_scalar>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldc, fp_scalar alpha, fp_scalar beta) {
+    // Verify USM herk on dev against the reference ::herk. First: log asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during HERK:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+    // Queue carries the handler above; its context is reused for the USM allocator below.
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data: A and C use a shared-USM allocator (64 is its alignment template argument).
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), C(ua);
+    rand_matrix(A, trans, n, k, lda);
+    rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc);
+    // Snapshot C so the reference call below works on its own copy of the same initial values.
+    auto C_ref = C;
+
+    // Call Reference HERK. Sizes are copied to const ints because ::herk takes them by address.
+    const int n_ref = n, k_ref = k;
+    const int lda_ref = lda, ldc_ref = ldc;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+    // alpha and beta are already fp_scalar, so only the matrix pointers are cast to fp_ref.
+    ::herk(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref,
+           (fp_scalar*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_scalar*)&beta,
+           (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ HERK: onemkl::blas::herk directly under CALL_RT_API, else via TEST_RUN_CT.
+    // Both paths wait for completion before C is compared on the host.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::herk(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta,
+                                  C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::herk,
+                    (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, C.data(),
+                     ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during HERK:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    // Unsupported backend: report a skip rather than a failure.
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+    // Other runtime errors are only logged; control still reaches the comparison below.
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // NOTE(review): 10 * max(n, k) is presumably the allowed-error scale - confirm in the helper.
+    bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
+    // Returns 1 when the matrices matched and 0 otherwise.
+    return (int)good;
+}
+
+class HerkUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Fixture is parameterized by device; each TEST_P sweeps uplo {lower,upper} x trans for one type.
+TEST_P(HerkUsmTests, ComplexSinglePrecision) {
+    float alpha(2.0);
+    float beta(3.0);  // calls below pass n=72, k=27, lda=101, ldc=103
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::nontrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::lower,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(GetParam(), onemkl::uplo::upper,
+                                                        onemkl::transpose::conjtrans, 72, 27, 101,
+                                                        103, alpha, beta)));
+}
+TEST_P(HerkUsmTests, ComplexDoublePrecision) {
+    double alpha(2.0);
+    double beta(3.0);  // calls below pass n=72, k=27, lda=101, ldc=103
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::nontrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::lower,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(GetParam(), onemkl::uplo::upper,
+                                                          onemkl::transpose::conjtrans, 72, 27, 101,
+                                                          103, alpha, beta)));
+}
+// One instantiation per element of `devices`; ::DeviceNamePrint() is passed as the name generator.
+INSTANTIATE_TEST_SUITE_P(HerkUsmTestSuite, HerkUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/symm.cpp b/tests/unit_tests/blas/level3/symm.cpp
index aa75d51c4..48586680c 100644
--- a/tests/unit_tests/blas/level3/symm.cpp
+++ b/tests/unit_tests/blas/level3/symm.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
-          int lda, int ldb, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     if (left_right == onemkl::side::left)
@@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SYMM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -111,7 +119,7 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
         good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SymmTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -119,50 +127,50 @@ class SymmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(SymmTests, RealSinglePrecision) {
     float alpha(2.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102,
-                            103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102,
-                            103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102,
-                            103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102,
-                            103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101,
+                                  102, 103, alpha, beta));
 }
 TEST_P(SymmTests, RealDoublePrecision) {
     double alpha(2.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102,
-                             103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102,
-                             103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102,
-                             103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102,
-                             103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101,
+                                   102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27,
+                                   101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101,
+                                   102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27,
+                                   101, 102, 103, alpha, beta));
 }
 TEST_P(SymmTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72,
-                                          27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72,
-                                          27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
 }
 TEST_P(SymmTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72,
-                                           27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72,
-                                           27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
 }
 
 INSTANTIATE_TEST_SUITE_P(SymmTestSuite, SymmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/symm_usm.cpp b/tests/unit_tests/blas/level3/symm_usm.cpp
new file mode 100644
index 000000000..b2cadde68
--- /dev/null
+++ b/tests/unit_tests/blas/level3/symm_usm.cpp
@@ -0,0 +1,178 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
+    // Asynchronous SYCL errors are printed to stdout, not rethrown.
+    auto exception_handler = [](exception_list async_errors) {
+        for (std::exception_ptr const& pending : async_errors) {
+            try {
+                std::rethrow_exception(pending);
+            }
+            catch (exception const& sycl_error) {
+                std::cout << "Caught asynchronous SYCL exception during SYMM:\n"
+                          << sycl_error.what() << std::endl
+                          << "OpenCL status: " << sycl_error.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in shared USM; A is generated from (m, m) for side::left, else (n, n).
+    usm_allocator<fp, usm::alloc::shared, 64> ua(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    int a_order = (left_right == onemkl::side::left) ? m : n;
+    rand_matrix(A, onemkl::transpose::nontrans, a_order, a_order, lda);
+    rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc);
+
+    // The reference routine works on its own copy of C.
+    vector<fp, decltype(ua)> C_ref(C);
+
+    // Call Reference SYMM; dimensions and scalars are handed over by pointer.
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    const int m_ref = m;
+    const int n_ref = n;
+    const int lda_ref = lda;
+    const int ldb_ref = ldb;
+    const int ldc_ref = ldc;
+
+    ::symm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
+           (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ SYMM: directly when CALL_RT_API is defined, via TEST_RUN_CT otherwise.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::symm(main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda,
+                                  B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::symm,
+                    (main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb,
+                     beta, C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& sycl_error) {
+        std::cout << "Caught synchronous SYCL exception during SYMM:\n"
+                  << sycl_error.what() << std::endl
+                  << "OpenCL status: " << sycl_error.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        // Unsupported by the backend: report the case as skipped instead of failed.
+        return test_skipped;
+    }
+    catch (const std::runtime_error& failure) {
+        std::cout << "Error raised during execution of SYMM:\n" << failure.what() << std::endl;
+    }
+
+    // Compare the DPC++ result with the reference; the handlers above that only print
+    // fall through to this check as well.
+    const bool matches = check_equal_matrix(C, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout);
+
+    return static_cast<int>(matches);
+}
+
+class SymmUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each test runs left/right x lower/upper with m=72, n=27, lda=101, ldb=102, ldc=103.
+TEST_P(SymmUsmTests, RealSinglePrecision) {  // fp = float
+    float alpha(2.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101,
+                                  102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101,
+                                  102, 103, alpha, beta));
+}
+TEST_P(SymmUsmTests, RealDoublePrecision) {  // fp = double
+    double alpha(2.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101,
+                                   102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27,
+                                   101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101,
+                                   102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27,
+                                   101, 102, 103, alpha, beta));
+}
+TEST_P(SymmUsmTests, ComplexSinglePrecision) {  // fp = std::complex<float>
+    std::complex<float> alpha(2.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+}
+TEST_P(SymmUsmTests, ComplexDoublePrecision) {  // fp = std::complex<double>
+    std::complex<double> alpha(2.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta));
+}
+// One instantiation per entry of `devices`; DeviceNamePrint is passed as the name generator.
+INSTANTIATE_TEST_SUITE_P(SymmUsmTestSuite, SymmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syr2k.cpp b/tests/unit_tests/blas/level3/syr2k.cpp
index 98ea7332d..ade60a741 100644
--- a/tests/unit_tests/blas/level3/syr2k.cpp
+++ b/tests/unit_tests/blas/level3/syr2k.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
-          int lda, int ldb, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
     rand_matrix(A, trans, n, k, lda);
@@ -101,6 +101,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SYR2K:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -108,7 +116,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
         good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class Syr2kTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -116,56 +124,58 @@ class Syr2kTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(Syr2kTests, RealSinglePrecision) {
     float alpha(3.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27,
-                            101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27,
-                            101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101,
-                            102, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101,
-                            102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                  27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                  27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27,
+                                  101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27,
+                                  101, 102, 103, alpha, beta));
 }
 TEST_P(Syr2kTests, RealDoublePrecision) {
     double alpha(3.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27,
-                             101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27,
-                             101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101,
-                             102, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101,
-                             102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73,
+                                   27, 101, 102, 103, alpha, beta));
 }
 TEST_P(Syr2kTests, ComplexSinglePrecision) {
     std::complex<float> alpha(3.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          73, 27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
 }
 TEST_P(Syr2kTests, ComplexDoublePrecision) {
     std::complex<double> alpha(3.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
-                                           alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::trans, 73, 27, 101, 102, 103, alpha,
-                                           beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::trans, 73, 27, 101, 102, 103, alpha,
-                                           beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
 }
 
 INSTANTIATE_TEST_SUITE_P(Syr2kTestSuite, Syr2kTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/syr2k_usm.cpp b/tests/unit_tests/blas/level3/syr2k_usm.cpp
new file mode 100644
index 000000000..df6432083
--- /dev/null
+++ b/tests/unit_tests/blas/level3/syr2k_usm.cpp
@@ -0,0 +1,183 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldb, int ldc, fp alpha, fp beta) {
+    // Asynchronous SYCL errors are printed to stdout, not rethrown.
+    auto exception_handler = [](exception_list async_errors) {
+        for (std::exception_ptr const& pending : async_errors) {
+            try {
+                std::rethrow_exception(pending);
+            }
+            catch (exception const& sycl_error) {
+                std::cout << "Caught asynchronous SYCL exception during SYR2K:\n"
+                          << sycl_error.what() << std::endl
+                          << "OpenCL status: " << sycl_error.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data in shared USM: A and B use (trans, n, k), C uses (nontrans, n, n).
+    usm_allocator<fp, usm::alloc::shared, 64> ua(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
+    rand_matrix(A, trans, n, k, lda);
+    rand_matrix(B, trans, n, k, ldb);
+    rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc);
+
+    // The reference routine works on its own copy of C.
+    vector<fp, decltype(ua)> C_ref(C);
+
+    // Call Reference SYR2K; dimensions and scalars are handed over by pointer.
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    const int n_ref = n;
+    const int k_ref = k;
+    const int lda_ref = lda;
+    const int ldb_ref = ldb;
+    const int ldc_ref = ldc;
+
+    ::syr2k(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref,
+            (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
+            (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
+
+    // Call DPC++ SYR2K: directly when CALL_RT_API is defined, via TEST_RUN_CT otherwise.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::syr2k(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda,
+                                   B.data(), ldb, beta, C.data(), ldc, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::syr2k,
+                    (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, B.data(), ldb,
+                     beta, C.data(), ldc, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& sycl_error) {
+        std::cout << "Caught synchronous SYCL exception during SYR2K:\n"
+                  << sycl_error.what() << std::endl
+                  << "OpenCL status: " << sycl_error.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        // Unsupported by the backend: report the case as skipped instead of failed.
+        return test_skipped;
+    }
+    catch (const std::runtime_error& failure) {
+        std::cout << "Error raised during execution of SYR2K:\n" << failure.what() << std::endl;
+    }
+
+    // Compare with the reference; the print-only handlers above also reach this check.
+    const bool matches = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
+    return static_cast<int>(matches);
+}
+
+class Syr2kUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+// Each test runs lower/upper x nontrans/trans with n=73, k=27, lda=101, ldb=102, ldc=103.
+TEST_P(Syr2kUsmTests, RealSinglePrecision) {  // fp = float
+    float alpha(3.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                  27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                  27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27,
+                                  101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27,
+                                  101, 102, 103, alpha, beta));
+}
+TEST_P(Syr2kUsmTests, RealDoublePrecision) {  // fp = double
+    double alpha(3.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73,
+                                   27, 101, 102, 103, alpha, beta));
+}
+TEST_P(Syr2kUsmTests, ComplexSinglePrecision) {  // fp = std::complex<float>
+    std::complex<float> alpha(3.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                alpha, beta));
+}
+TEST_P(Syr2kUsmTests, ComplexDoublePrecision) {  // fp = std::complex<double>
+    std::complex<double> alpha(3.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::trans, 73, 27, 101, 102, 103,
+                                                 alpha, beta));
+}
+// One instantiation per entry of `devices`; DeviceNamePrint is passed as the name generator.
+INSTANTIATE_TEST_SUITE_P(Syr2kUsmTestSuite, Syr2kUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syrk.cpp b/tests/unit_tests/blas/level3/syrk.cpp
index 05b4d9e8c..66df33a3f 100644
--- a/tests/unit_tests/blas/level3/syrk.cpp
+++ b/tests/unit_tests/blas/level3/syrk.cpp
@@ -44,8 +44,8 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
-          int lda, int ldc, fp alpha, fp beta) {
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldc, fp alpha, fp beta) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, C, C_ref;
     rand_matrix(A, trans, n, k, lda);
@@ -99,6 +99,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -106,7 +114,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans,
         good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class SyrkTests : public ::testing::TestWithParam<cl::sycl::device> {};
@@ -114,53 +122,53 @@ class SyrkTests : public ::testing::TestWithParam<cl::sycl::device> {};
 TEST_P(SyrkTests, RealSinglePrecision) {
     float alpha(3.0);
     float beta(3.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27,
-                            101, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27,
-                            101, 103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101,
-                            103, alpha, beta));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101,
-                            103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                  27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                  27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27,
+                                  101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27,
+                                  101, 103, alpha, beta));
 }
 TEST_P(SyrkTests, RealDoublePrecision) {
     double alpha(3.0);
     double beta(3.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27,
-                             101, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27,
-                             101, 103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101,
-                             103, alpha, beta));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101,
-                             103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73,
+                                   27, 101, 103, alpha, beta));
 }
 TEST_P(SyrkTests, ComplexSinglePrecision) {
     std::complex<float> alpha(3.0, -0.5);
     std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, 73, 27, 101, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, 73, 27, 101, 103, alpha,
-                                          beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans,
-                                          73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans,
-                                          73, 27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
 }
 TEST_P(SyrkTests, ComplexDoublePrecision) {
     std::complex<double> alpha(3.0, -0.5);
     std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, 73, 27, 101, 103, alpha,
-                                           beta));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, 73, 27, 101, 103, alpha,
-                                           beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUE(test<std::complex<double>>(
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
         GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
 }
 
diff --git a/tests/unit_tests/blas/level3/syrk_usm.cpp b/tests/unit_tests/blas/level3/syrk_usm.cpp
new file mode 100644
index 000000000..8f2463f5e
--- /dev/null
+++ b/tests/unit_tests/blas/level3/syrk_usm.cpp
@@ -0,0 +1,194 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one USM SYRK case on `dev` and checks it against the reference BLAS.
+//
+// A and C are populated via rand_matrix() in USM shared allocations; a copy of C
+// is updated by the reference ::syrk() and then compared with the C written by
+// onemkl::blas::syrk().
+//
+// Returns test_skipped when the call throws backend_unsupported_exception,
+// otherwise 1 if check_equal_matrix() accepts the result and 0 if it does not.
+// SYCL and std::runtime_error exceptions are only logged; the comparison still runs.
+template <typename fp>
+int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k,
+         int lda, int ldc, fp alpha, fp beta) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during SYRK:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), C(ua);
+    rand_matrix(A, trans, n, k, lda);
+    rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc);
+
+    auto C_ref = C;
+
+    // Call Reference SYRK.
+    const int n_ref = n, k_ref = k;
+    const int lda_ref = lda, ldc_ref = ldc;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::syrk(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(),
+           &ldc_ref);
+
+    // Call DPC++ SYRK. wait_and_throw() is used instead of wait() so that pending
+    // asynchronous errors are actually delivered to exception_handler.
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::syrk(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta,
+                                  C.data(), ldc, dependencies);
+        done.wait_and_throw();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::syrk,
+                    (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, C.data(),
+                     ldc, dependencies));
+        main_queue.wait_and_throw();
+#endif
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during SYRK:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+
+    bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout);
+
+    return (int)good;
+}
+
+// Fixture parameterized over the SYCL devices the tests are instantiated with.
+class SyrkUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+// Every precision below runs the same four cases: {lower, upper} x {nontrans, trans}.
+TEST_P(SyrkUsmTests, RealSinglePrecision) {
+    float alpha(3.0);
+    float beta(3.0);
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                  27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                  27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27,
+                                  101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27,
+                                  101, 103, alpha, beta));
+}
+TEST_P(SyrkUsmTests, RealDoublePrecision) {
+    double alpha(3.0);
+    double beta(3.0);
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73,
+                                   27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73,
+                                   27, 101, 103, alpha, beta));
+}
+TEST_P(SyrkUsmTests, ComplexSinglePrecision) {
+    std::complex<float> alpha(3.0, -0.5);
+    std::complex<float> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
+}
+TEST_P(SyrkUsmTests, ComplexDoublePrecision) {
+    std::complex<double> alpha(3.0, -0.5);
+    std::complex<double> beta(3.0, -1.5);
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::lower,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::uplo::upper,
+                                                 onemkl::transpose::nontrans, 73, 27, 101, 103,
+                                                 alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(
+        GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
+}
+
+// Instantiate the suite once per entry of the global `devices` list.
+INSTANTIATE_TEST_SUITE_P(SyrkUsmTestSuite, SyrkUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trmm.cpp b/tests/unit_tests/blas/level3/trmm.cpp
index abe25fdbd..b7395db68 100644
--- a/tests/unit_tests/blas/level3/trmm.cpp
+++ b/tests/unit_tests/blas/level3/trmm.cpp
@@ -44,9 +44,9 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
-          fp alpha) {
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
+         onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
+         fp alpha) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, B_ref;
     if (left_right == onemkl::side::right)
@@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -111,166 +119,166 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
         good = check_equal_matrix(B_accessor, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TrmmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TrmmTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
 }
 TEST_P(TrmmTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
 }
 TEST_P(TrmmTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
 }
 TEST_P(TrmmTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
 }
 
 INSTANTIATE_TEST_SUITE_P(TrmmTestSuite, TrmmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/trmm_usm.cpp b/tests/unit_tests/blas/level3/trmm_usm.cpp
new file mode 100644
index 000000000..991dac0f9
--- /dev/null
+++ b/tests/unit_tests/blas/level3/trmm_usm.cpp
@@ -0,0 +1,287 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one TRMM case through the USM API on `dev` and validates it against the reference
+// BLAS. Returns test_skipped when the backend throws backend_unsupported_exception;
+// otherwise returns the boolean verdict of check_equal_matrix(B, B_ref, ...) as an int.
+template <typename fp>
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
+         onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
+         fp alpha) {
+    // Print asynchronous SYCL errors; the handler is invoked by wait_and_throw() below.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const& eptr : exceptions) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& e) {
+                std::cout << "Caught asynchronous SYCL exception during TRMM:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    std::vector<event> dependencies;
+
+    // Prepare data in shared USM allocations made through a 64-byte-aligned allocator.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua);
+    // A's extent follows the side: (n, n) for side::right, (m, m) otherwise; B uses (m, n).
+    if (left_right == onemkl::side::right)
+        rand_matrix(A, transa, n, n, lda);
+    else
+        rand_matrix(A, transa, m, m, lda);
+    rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb);
+
+    // The reference routine works on its own copy so B stays untouched for the device call.
+    auto B_ref = B;
+
+    // Call Reference TRMM; its interface takes the integer arguments by pointer.
+    const int m_ref = m, n_ref = n;
+    const int lda_ref = lda, ldb_ref = ldb;
+    using fp_ref = typename ref_type_info<fp>::type;
+
+    ::trmm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower),
+           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
+
+    // Call DPC++ TRMM.
+    try {
+#ifdef CALL_RT_API
+        event done = onemkl::blas::trmm(main_queue, left_right, upper_lower, transa, unit_nonunit,
+                                        m, n, alpha, A.data(), lda, B.data(), ldb, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::trmm,
+                    (main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
+                     A.data(), lda, B.data(), ldb, dependencies));
+#endif
+        // Unlike wait(), wait_and_throw() also hands pending asynchronous errors to the
+        // queue's handler, so they are reported before the results are compared.
+        main_queue.wait_and_throw();
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        // Most specific handler first: an unsupported backend is a skip, not a failure.
+        return test_skipped;
+    }
+    catch (exception const& e) {
+        std::cout << "Caught synchronous SYCL exception during TRMM:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    const bool good = check_equal_matrix(B, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout);
+    return static_cast<int>(good);
+}
+
+class TrmmUsmTests : public ::testing::TestWithParam<device> {};  // param: SYCL device under test
+
+TEST_P(TrmmUsmTests, RealSinglePrecision) {  // float TRMM on the device from GetParam()
+    float alpha(2.0);  // all cases: m=72, n=27, lda=101, ldb=102; 2 unit + 8 nonunit cases
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+}
+TEST_P(TrmmUsmTests, RealDoublePrecision) {  // double TRMM on the device from GetParam()
+    double alpha(2.0);  // all cases: m=72, n=27, lda=101, ldb=102; 2 unit + 8 nonunit cases
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+}
+TEST_P(TrmmUsmTests, ComplexSinglePrecision) {  // complex<float> TRMM on the GetParam() device
+    std::complex<float> alpha(2.0, -0.5);  // m=72, n=27, lda=101, ldb=102; 2 unit + 12 nonunit
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+}
+TEST_P(TrmmUsmTests, ComplexDoublePrecision) {  // complex<double> TRMM on the GetParam() device
+    std::complex<double> alpha(2.0, -0.5);  // m=72, n=27, lda=101, ldb=102; 2 unit + 12 nonunit
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+}
+
+INSTANTIATE_TEST_SUITE_P(TrmmUsmTestSuite, TrmmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());  // one instance per entry of `devices`
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trsm.cpp b/tests/unit_tests/blas/level3/trsm.cpp
index e742fcbee..3dbfbbcc2 100644
--- a/tests/unit_tests/blas/level3/trsm.cpp
+++ b/tests/unit_tests/blas/level3/trsm.cpp
@@ -44,9 +44,9 @@ extern std::vector<cl::sycl::device> devices;
 namespace {
 
 template <typename fp>
-bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
-          onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
-          fp alpha) {
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
+         onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
+         fp alpha) {
     // Prepare data.
     vector<fp, allocator_helper<fp, 64>> A, B, B_ref;
     if (left_right == onemkl::side::right)
@@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
                   << "OpenCL status: " << e.get_cl_code() << std::endl;
     }
 
+    catch (const onemkl::backend_unsupported_exception& e) {
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error& error) {
+        std::cout << "Error raised during execution of TRSM:\n" << error.what() << std::endl;
+    }
+
     // Compare the results of reference implementation and DPC++ implementation.
     bool good;
     {
@@ -112,258 +120,262 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
             check_equal_trsm_matrix(B_accessor, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout);
     }
 
-    return good;
+    return (int)good;
 }
 
 class TrsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
 
 TEST_P(TrsmTests, RealSinglePrecision) {
     float alpha(2.0);
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
-    EXPECT_TRUE(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                            onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                            alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
 }
 TEST_P(TrsmTests, RealDoublePrecision) {
     double alpha(2.0);
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
-    EXPECT_TRUE(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                             onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
-                             alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
 }
 TEST_P(TrsmTests, ComplexSinglePrecision) {
     std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101,
-                                          102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101,
-                                          102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101,
-                                          102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101,
-                                          102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                          101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<float>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                          onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                          27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));  // Call 1 of 24: together the calls sweep every side x uplo x transpose x diag combination.
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));  // 72, 27, 101, 102 are presumably m, n, lda, ldb -- TODO confirm against test().
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));  // NOTE(review): EXPECT_TRUEORSKIP is defined outside this hunk; presumably it accepts a "skipped" result -- confirm.
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));  // Call 13: calls 13-24 repeat the sweep of calls 1-12 with diag::nonunit.
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
 }
 TEST_P(TrsmTests, ComplexDoublePrecision) {
     std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::nontrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27,
-                                           101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
-    EXPECT_TRUE(test<std::complex<double>>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
-                                           onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72,
-                                           27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));  // Call 1 of 24: together the calls sweep every side x uplo x transpose x diag combination.
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));  // 72, 27, 101, 102 are presumably m, n, lda, ldb -- TODO confirm against test().
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));  // NOTE(review): EXPECT_TRUEORSKIP is defined outside this hunk; presumably it accepts a "skipped" result -- confirm.
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));  // Call 13: calls 13-24 repeat the sweep of calls 1-12 with diag::nonunit.
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
 }
 
 INSTANTIATE_TEST_SUITE_P(TrsmTestSuite, TrsmTests, ::testing::ValuesIn(devices),
diff --git a/tests/unit_tests/blas/level3/trsm_usm.cpp b/tests/unit_tests/blas/level3/trsm_usm.cpp
new file mode 100644
index 000000000..453095ccc
--- /dev/null
+++ b/tests/unit_tests/blas/level3/trsm_usm.cpp
@@ -0,0 +1,383 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs a single USM TRSM case on `dev` and compares B with the reference ::trsm output.
+// Returns test_skipped when the backend reports it is unsupported, else the comparison result.
+template <typename fp>
+int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower,
+         onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb,
+         fp alpha) {
+    // Print SYCL exceptions that the queue delivers asynchronously.
+    auto async_reporter = [](exception_list pending) {
+        for (std::exception_ptr const& eptr : pending) {
+            try {
+                std::rethrow_exception(eptr);
+            }
+            catch (exception const& sycl_err) {
+                std::cout << "Caught asynchronous SYCL exception during TRSM:\n"
+                          << sycl_err.what() << std::endl
+                          << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, async_reporter);
+    context cxt = main_queue.get_context();
+    std::vector<event> dependencies;
+    event done;
+
+    // Build the inputs in shared USM; A is generated as n x n for side::right, m x m otherwise.
+    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, dev);
+    vector<fp, decltype(ua)> A(ua), B(ua);
+    const int a_dim = (left_right == onemkl::side::right) ? n : m;
+    rand_trsm_matrix(A, transa, a_dim, a_dim, lda);
+    rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb);
+
+    // Separate copy of B for the reference implementation to work on.
+    auto B_ref = B;
+
+    // Call Reference TRSM (its integer arguments are passed by pointer).
+    const int m_ref = m;
+    const int n_ref = n;
+    const int lda_ref = lda;
+    const int ldb_ref = ldb;
+
+    using fp_ref = typename ref_type_info<fp>::type;
+    ::trsm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower),
+           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref,
+           (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
+
+    // Call DPC++ TRSM and wait for it to finish before B is read on the host.
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::trsm(main_queue, left_right, upper_lower, transa, unit_nonunit, m, n,
+                                  alpha, A.data(), lda, B.data(), ldb, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::trsm,
+                    (main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
+                     A.data(), lda, B.data(), ldb, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const& sycl_err) {
+        std::cout << "Caught synchronous SYCL exception during TRSM:\n"
+                  << sycl_err.what() << std::endl
+                  << "OpenCL status: " << sycl_err.get_cl_code() << std::endl;
+    }
+    catch (const onemkl::backend_unsupported_exception&) {
+        // Nothing to verify on this device; the caller turns this code into a skip.
+        return test_skipped;
+    }
+    catch (const std::runtime_error& rt_err) {
+        std::cout << "Error raised during execution of TRSM:\n" << rt_err.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    // A logged (non-skip) exception does not return early; the comparison still runs.
+    const bool matches =
+        check_equal_trsm_matrix(B, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout);
+
+    return static_cast<int>(matches);
+}
+
+class TrsmUsmTests : public ::testing::TestWithParam<cl::sycl::device> {}; // param: SYCL device
+
+TEST_P(TrsmUsmTests, RealSinglePrecision) { // float; 16 side/uplo/trans/diag combinations
+    float alpha(2.0); // every call below uses m=72, n=27, lda=101, ldb=102
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                  102, alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                  onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102,
+                                  alpha));
+}
+TEST_P(TrsmUsmTests, RealDoublePrecision) { // double; 16 side/uplo/trans/diag combinations
+    double alpha(2.0); // every call below uses m=72, n=27, lda=101, ldb=102
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102,
+                                   alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::lower,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), onemkl::side::right, onemkl::uplo::upper,
+                                   onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101,
+                                   102, alpha));
+}
+TEST_P(TrsmUsmTests, ComplexSinglePrecision) { // complex<float>; 24 combinations incl. conjtrans
+    std::complex<float> alpha(2.0, -0.5); // every call below uses m=72, n=27, lda=101, ldb=102
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::unit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::unit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::nontrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::trans, onemkl::diag::nonunit, 72,
+                                                27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::trans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::lower,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::left, onemkl::uplo::upper,
+                                                onemkl::transpose::conjtrans, onemkl::diag::nonunit,
+                                                72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), onemkl::side::right,
+                                                onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+}
+TEST_P(TrsmUsmTests, ComplexDoublePrecision) { // complex<double>; 24 combinations incl. conjtrans
+    std::complex<double> alpha(2.0, -0.5); // every call below uses m=72, n=27, lda=101, ldb=102
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::unit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::nontrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::trans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::lower, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::left,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), onemkl::side::right,
+                                                 onemkl::uplo::upper, onemkl::transpose::conjtrans,
+                                                 onemkl::diag::nonunit, 72, 27, 101, 102, alpha));
+}
+
+INSTANTIATE_TEST_SUITE_P(TrsmUsmTestSuite, TrsmUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint()); // instantiated once per element of `devices`
+
+} // anonymous namespace
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp
index 442dd712c..4e72d6ece 100644
--- a/tests/unit_tests/include/test_helper.hpp
+++ b/tests/unit_tests/include/test_helper.hpp
@@ -29,6 +29,19 @@
     #include <stdlib.h>
 #endif
 
+#define test_failed  0 // result codes consumed by EXPECT_TRUEORSKIP below
+#define test_passed  1
+#define test_skipped 2
+// Expects `a` == test_passed, or skips the current gtest when `a` == test_skipped.
+#define EXPECT_TRUEORSKIP(a)                         \
+    do {                                             \
+        const int trueorskip_res_ = (a);             \
+        if (trueorskip_res_ == test_skipped)         \
+            GTEST_SKIP();                            \
+        else                                         \
+            EXPECT_EQ(trueorskip_res_, test_passed); \
+    } while (0)
+
 #ifdef ENABLE_MKLCPU_BACKEND
     #define TEST_RUN_INTELCPU(q, func, args) \
         func<onemkl::library::intelmkl, onemkl::backend::intelcpu> args
@@ -95,6 +108,33 @@ static inline void aligned_free(void *p) {
     ::free(p);
 #endif
 }
+
+/* Support for Unified Shared Memory allocations for different backends.
+ * Allocates `size` bytes of test data; `align` is honored only on the host-allocation path. */
+static inline void *malloc_shared(size_t align, size_t size, cl::sycl::device dev,
+                                  cl::sycl::context ctx) {
+#if !defined(_WIN64) && defined(ENABLE_CUBLAS_BACKEND)
+    // aligned_alloc requires size to be a multiple of align (C11); round up to stay defined.
+    const size_t padded = (align != 0) ? ((size + align - 1) / align) * align : size;
+    return ::aligned_alloc(align, padded);
+#else
+    // `align` is unused on this path: the allocation goes through cl::sycl::malloc_shared.
+    return cl::sycl::malloc_shared(size, dev, ctx);
+#endif
+}
+
+// Release a pointer obtained from malloc_shared(), using the deallocator that
+// matches the allocator selected there for the current platform/backend.
+static inline void free_shared(void *p, cl::sycl::context ctx) {
+#if defined(ENABLE_CUBLAS_BACKEND) && !defined(_WIN64)
+    // Host allocation path (non-Windows cuBLAS build): plain C free.
+    ::free(p);
+#else
+    // Every other configuration allocates through the SYCL runtime.
+    cl::sycl::free(p, ctx);
+#endif
+}
+
 } // namespace onemkl
 
 #endif // _TEST_HELPER_HPP_

From 549da82131f066dd117e6470224a101e8da4affb Mon Sep 17 00:00:00 2001
From: "Meterelliyoz, Mesut" <mesut.meterelliyoz@intel.com>
Date: Wed, 27 May 2020 23:32:52 -0700
Subject: [PATCH 2/5] Fix missing symbol on Windows

---
 .../mklgpu/mkl_internal_blas_gpu_wrappers.cpp | 16 +++----
 .../mklgpu/mkl_internal_blas_sycl_gpu.hpp     | 42 +++++++++----------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
index ea02a5cc1..95bd9b73c 100644
--- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
+++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp
@@ -2559,8 +2559,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha
     std::int64_t total_group_size = 0;
     for (std::int64_t i = 0; i < group_count; i++) {
         cl::sycl::event *axpy_batch_event = new cl::sycl::event(
-            mkl::gpu::saxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
-                                  total_group_size, dependencies));
+            mkl::gpu::saxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i],
+                                       group_size[i], total_group_size, dependencies));
         coalesced_events.push_back(axpy_batch_event);
         total_group_size += group_size[i];
     }
@@ -2576,8 +2576,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alph
     std::int64_t total_group_size = 0;
     for (std::int64_t i = 0; i < group_count; i++) {
         cl::sycl::event *axpy_batch_event = new cl::sycl::event(
-            mkl::gpu::daxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
-                                  total_group_size, dependencies));
+            mkl::gpu::daxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i],
+                                       group_size[i], total_group_size, dependencies));
         coalesced_events.push_back(axpy_batch_event);
         total_group_size += group_size[i];
     }
@@ -2594,8 +2594,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex
     std::int64_t total_group_size = 0;
     for (std::int64_t i = 0; i < group_count; i++) {
         cl::sycl::event *axpy_batch_event = new cl::sycl::event(
-            mkl::gpu::caxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
-                                  total_group_size, dependencies));
+            mkl::gpu::caxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i],
+                                       group_size[i], total_group_size, dependencies));
         coalesced_events.push_back(axpy_batch_event);
         total_group_size += group_size[i];
     }
@@ -2612,8 +2612,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex
     std::int64_t total_group_size = 0;
     for (std::int64_t i = 0; i < group_count; i++) {
         cl::sycl::event *axpy_batch_event = new cl::sycl::event(
-            mkl::gpu::zaxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
-                                  total_group_size, dependencies));
+            mkl::gpu::zaxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i],
+                                       group_size[i], total_group_size, dependencies));
         coalesced_events.push_back(axpy_batch_event);
         total_group_size += group_size[i];
     }
diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
index 6a68a73a0..33ea034ee 100644
--- a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
+++ b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp
@@ -1575,27 +1575,27 @@ cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TR
                             int64_t group_size,
                             const cl::sycl::vector_class<cl::sycl::event> &dependencies);
 
-cl::sycl::event saxpy_batch(cl::sycl::queue &queue, std::int64_t n, float alpha, const float **x,
-                            std::int64_t incx, float **y, std::int64_t incy,
-                            std::int64_t batch_size, std::int64_t offset,
-                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);
-
-cl::sycl::event daxpy_batch(cl::sycl::queue &queue, std::int64_t n, double alpha, const double **x,
-                            std::int64_t incx, double **y, std::int64_t incy,
-                            std::int64_t batch_size, std::int64_t offset,
-                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);
-
-cl::sycl::event caxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                            const std::complex<float> **x, std::int64_t incx,
-                            std::complex<float> **y, std::int64_t incy, std::int64_t batch_size,
-                            std::int64_t offset,
-                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);
-
-cl::sycl::event zaxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                            const std::complex<double> **x, std::int64_t incx,
-                            std::complex<double> **y, std::int64_t incy, std::int64_t batch_size,
-                            std::int64_t offset,
-                            const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+cl::sycl::event saxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, float alpha,
+                                 const float **x, std::int64_t incx, float **y, std::int64_t incy,
+                                 std::int64_t batch_size, std::int64_t offset,
+                                 const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event daxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, double alpha,
+                                 const double **x, std::int64_t incx, double **y, std::int64_t incy,
+                                 std::int64_t batch_size, std::int64_t offset,
+                                 const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event caxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, std::complex<float> alpha,
+                                 const std::complex<float> **x, std::int64_t incx,
+                                 std::complex<float> **y, std::int64_t incy,
+                                 std::int64_t batch_size, std::int64_t offset,
+                                 const cl::sycl::vector_class<cl::sycl::event> &dependencies);
+
+cl::sycl::event zaxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, std::complex<double> alpha,
+                                 const std::complex<double> **x, std::int64_t incx,
+                                 std::complex<double> **y, std::int64_t incy,
+                                 std::int64_t batch_size, std::int64_t offset,
+                                 const cl::sycl::vector_class<cl::sycl::event> &dependencies);
 
 cl::sycl::event sgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa,
                             MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, const float *a,

From f4f856cd5f40494aeba40b552eb86ec64fb27e5e Mon Sep 17 00:00:00 2001
From: "Meterelliyoz, Mesut" <mesut.meterelliyoz@intel.com>
Date: Thu, 28 May 2020 15:06:42 -0700
Subject: [PATCH 3/5] Fix link line for mklgpu backend

---
 cmake/FindMKL.cmake                     | 2 +-
 src/blas/backends/mklgpu/CMakeLists.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
index 9a210fba7..9f9e0316c 100644
--- a/cmake/FindMKL.cmake
+++ b/cmake/FindMKL.cmake
@@ -85,7 +85,7 @@ if (ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND)
     list(APPEND MKL_LINK_C ${TBB_LINK})
   endif()
   if(ENABLE_MKLGPU_BACKEND)
-    set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ${SYCL_LINK_FLAGS})
+    set(MKL_LINK_SYCL ${SYCL_LINK_FLAGS} ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} )
   endif()
 endif()
 
diff --git a/src/blas/backends/mklgpu/CMakeLists.txt b/src/blas/backends/mklgpu/CMakeLists.txt
index f2f35c831..07fd972ab 100644
--- a/src/blas/backends/mklgpu/CMakeLists.txt
+++ b/src/blas/backends/mklgpu/CMakeLists.txt
@@ -45,6 +45,7 @@ set_target_properties(${LIB_OBJ} PROPERTIES
 )
 target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
 
+# Set link libraries as non-transitive for shared (dynamic) builds
 if(BUILD_SHARED_LIBS)
   set_target_properties(${LIB_NAME} PROPERTIES
     INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL

From a58b6e98ce38ea0e90e2f361ac8982ba3ec26f61 Mon Sep 17 00:00:00 2001
From: "Meterelliyoz, Mesut" <mesut.meterelliyoz@intel.com>
Date: Thu, 28 May 2020 15:25:57 -0700
Subject: [PATCH 4/5] Update sycl linker

---
 cmake/FindCompiler.cmake | 5 +++--
 cmake/FindMKL.cmake      | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
index e9125af8f..2f5013e24 100644
--- a/cmake/FindCompiler.cmake
+++ b/cmake/FindCompiler.cmake
@@ -35,6 +35,7 @@ if(SYCL_FOUND AND is_dpcpp)
 
   add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED)
   set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
-      INTERFACE_COMPILE_OPTIONS "-fsycl"
-      INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
+     INTERFACE_COMPILE_OPTIONS "-fsycl"
+     INTERFACE_LINK_OPTIONS "-fsycl"
+     INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
 endif()
diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake
index 9f9e0316c..a358dd0c0 100644
--- a/cmake/FindMKL.cmake
+++ b/cmake/FindMKL.cmake
@@ -66,7 +66,6 @@ if(UNIX)
   list(APPEND MKL_LINK_PREFIX "-L${MKL_LIB_DIR}")
   set(LIB_PREFIX "-l")
   set(OPENCL_LIBNAME "OpenCL")
-  set(SYCL_LINK_FLAGS "-fsycl")
 else()
   if(${BUILD_SHARED_LIBS})
     set(MKL_COPT ${MKL_COPT} "-Donemkl_EXPORTS")
@@ -85,7 +84,7 @@ if (ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND)
     list(APPEND MKL_LINK_C ${TBB_LINK})
   endif()
   if(ENABLE_MKLGPU_BACKEND)
-    set(MKL_LINK_SYCL ${SYCL_LINK_FLAGS} ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} )
+    set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} )
   endif()
 endif()
 

From ebb4d6a6efff8d122eec01b3a19842042767935f Mon Sep 17 00:00:00 2001
From: "Meterelliyoz, Mesut" <mesut.meterelliyoz@intel.com>
Date: Thu, 28 May 2020 21:33:06 -0700
Subject: [PATCH 5/5] Remove warnings due to fsycl in Windows

---
 cmake/FindCompiler.cmake | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
index 2f5013e24..ab0504af8 100644
--- a/cmake/FindCompiler.cmake
+++ b/cmake/FindCompiler.cmake
@@ -34,8 +34,15 @@ if(SYCL_FOUND AND is_dpcpp)
   find_library(SYCL_LIBRARY NAMES sycl PATHS "${SYCL_BINARY_DIR}/../lib")
 
   add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED)
-  set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
-     INTERFACE_COMPILE_OPTIONS "-fsycl"
-     INTERFACE_LINK_OPTIONS "-fsycl"
-     INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
+  if(UNIX)
+    set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
+      INTERFACE_COMPILE_OPTIONS "-fsycl"
+      INTERFACE_LINK_OPTIONS "-fsycl"
+      INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
+  else()
+    set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
+      INTERFACE_COMPILE_OPTIONS "-fsycl"
+      INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
+  endif()
+
 endif()