uxlfoundation · andrewtbarker · Nov 9, 2023 · Jun 23, 2023 · Jun 27, 2023 · Jul 15, 2023
@@ -345,6 +345,7 @@ OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)
 #undef OMATADD_LAUNCHER_USM
 
 } // namespace column_major
+
 namespace row_major {
 
 // Buffer APIs
@@ -405,27 +406,41 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra
     throw unimplemented("blas", "gemmt", "for row_major layout");
 }
 
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
+template <typename Func, typename T>
+void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
+              int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
+              int64_t ldb) {
+    using cuDataType = typename CudaEquivalentType<T>::Type;
+    overflow_check(m, n, lda, ldb);
+    queue.submit([&](sycl::handler &cgh) {
+        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
+        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
+        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
+        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
+        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
+            auto handle = sc.get_handle(queue);
+            auto a_ = sc.get_mem<cuDataType *>(a_acc);
+            auto b_ = sc.get_mem<cuDataType *>(b_acc);
+            cublasStatus_t err;
+            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
+                                     get_cublas_operation(trans), logical_m, logical_n,
+                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
+        });
+    });
 }
 
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+#define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                   \
+    void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,          \
+                  sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
+        omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb);     \
+    }
 
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+OMATCOPY_LAUNCHER(float, cublasSgeam)
+OMATCOPY_LAUNCHER(double, cublasDgeam)
+OMATCOPY_LAUNCHER(std::complex<float>, cublasCgeam)
+OMATCOPY_LAUNCHER(std::complex<double>, cublasZgeam)
 
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+#undef OMATCOPY_LAUNCHER
 
 void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
               sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
@@ -447,31 +462,43 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co
     throw unimplemented("blas", "imatcopy", "for row_major layout");
 }
 
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
+template <typename Func, typename T>
+void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
+             transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
+             T beta, sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
+    using cuDataType = typename CudaEquivalentType<T>::Type;
+    overflow_check(m, n, lda, ldb, ldc);
+    queue.submit([&](sycl::handler &cgh) {
+        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
+        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
+        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
+        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
+            auto handle = sc.get_handle(queue);
+            auto a_ = sc.get_mem<cuDataType *>(a_acc);
+            auto b_ = sc.get_mem<cuDataType *>(b_acc);
+            auto c_ = sc.get_mem<cuDataType *>(c_acc);
+            cublasStatus_t err;
+            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
+                                     get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
+                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
+        });
+    });
 }
 
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+#define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
+    void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,     \
+                 TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                     \
+                 sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) {   \
+        omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \
+                b, ldb, c, ldc);                                                                   \
+    }
 
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+OMATADD_LAUNCHER(float, cublasSgeam)
+OMATADD_LAUNCHER(double, cublasDgeam)
+OMATADD_LAUNCHER(std::complex<float>, cublasCgeam)
+OMATADD_LAUNCHER(std::complex<double>, cublasZgeam)
 
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+#undef OMATADD_LAUNCHER
 
 // USM APIs
 
@@ -537,31 +564,43 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp
     throw unimplemented("blas", "gemmt", "for row_major layout");
 }
 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     const float *a, int64_t lda, float *b, int64_t ldb,
+template <typename Func, typename T>
+sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans,
+                     int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
+    using cuDataType = typename CudaEquivalentType<T>::Type;
+    overflow_check(m, n, lda, ldb);
+    auto done = queue.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependencies);
+        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
+        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
+        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
+            auto handle = sc.get_handle(queue);
+            auto a_ = reinterpret_cast<const cuDataType *>(a);
+            auto b_ = reinterpret_cast<cuDataType *>(b);
+            cublasStatus_t err;
+            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
+                                     get_cublas_operation(trans), logical_m, logical_n,
+                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, ldb, b_, ldb);
+        });
+    });
+    return done;
 }
 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     const double *a, int64_t lda, double *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+#define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
+                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,                      \
+                         const std::vector<sycl::event> &dependencies) {                        \
+        return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b,  \
+                        ldb, dependencies);                                                     \
+    }
 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+OMATCOPY_LAUNCHER_USM(float, cublasSgeam)
+OMATCOPY_LAUNCHER_USM(double, cublasDgeam)
+OMATCOPY_LAUNCHER_USM(std::complex<float>, cublasCgeam)
+OMATCOPY_LAUNCHER_USM(std::complex<double>, cublasZgeam)
 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-}
+#undef OMATCOPY_LAUNCHER_USM
 
 sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
                      float *ab, int64_t lda, int64_t ldb,
@@ -587,35 +626,44 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
     throw unimplemented("blas", "imatcopy", "for row_major layout");
 }
 
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    float alpha, const float *a, int64_t lda, float beta, const float *b,
-                    int64_t ldb, float *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
+template <typename Func, typename T>
+inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
+                           transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
+                           T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
+                           const std::vector<sycl::event> &dependencies) {
+    using cuDataType = typename CudaEquivalentType<T>::Type;
+    overflow_check(m, n, lda, ldb, ldc);
+    auto done = queue.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependencies);
+        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
+            auto handle = sc.get_handle(queue);
+            auto a_ = reinterpret_cast<const cuDataType *>(a);
+            auto b_ = reinterpret_cast<const cuDataType *>(b);
+            auto c_ = reinterpret_cast<cuDataType *>(c);
+            cublasStatus_t err;
+            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
+                                     get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
+                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
+        });
+    });
+    return done;
 }
 
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    double alpha, const double *a, int64_t lda, double beta, const double *b,
-                    int64_t ldb, double *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+#define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
+    sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m,     \
+                        int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta,          \
+                        const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc,                      \
+                        const std::vector<sycl::event> &dependencies) {                        \
+        return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \
+                       lda, beta, b, ldb, c, ldc, dependencies);                               \
+    }
 
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                    std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
-                    std::complex<float> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+OMATADD_LAUNCHER_USM(float, cublasSgeam)
+OMATADD_LAUNCHER_USM(double, cublasDgeam)
+OMATADD_LAUNCHER_USM(std::complex<float>, cublasCgeam)
+OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)
 
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                    std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
-                    std::complex<double> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-}
+#undef OMATADD_LAUNCHER_USM
 
 } // namespace row_major
 } // namespace cublas