Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 131 additions & 83 deletions src/blas/backends/cublas/cublas_extensions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)
#undef OMATADD_LAUNCHER_USM

} // namespace column_major

namespace row_major {

// Buffer APIs
Expand Down Expand Up @@ -405,27 +406,41 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra
throw unimplemented("blas", "gemmt", "for row_major layout");
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
template <typename Func, typename T>
void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
int64_t ldb) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb);
queue.submit([&](sycl::handler &cgh) {
auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = sc.get_mem<cuDataType *>(a_acc);
auto b_ = sc.get_mem<cuDataType *>(b_acc);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
get_cublas_operation(trans), logical_m, logical_n,
(cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
});
});
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
#define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \
void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
OMATCOPY_LAUNCHER(float, cublasSgeam)
OMATCOPY_LAUNCHER(double, cublasDgeam)
OMATCOPY_LAUNCHER(std::complex<float>, cublasCgeam)
OMATCOPY_LAUNCHER(std::complex<double>, cublasZgeam)

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
#undef OMATCOPY_LAUNCHER

void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
Expand All @@ -447,31 +462,43 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co
throw unimplemented("blas", "imatcopy", "for row_major layout");
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for row_major layout");
template <typename Func, typename T>
void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
T beta, sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb, ldc);
queue.submit([&](sycl::handler &cgh) {
auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = sc.get_mem<cuDataType *>(a_acc);
auto b_ = sc.get_mem<cuDataType *>(b_acc);
auto c_ = sc.get_mem<cuDataType *>(c_acc);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
});
});
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
#define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE) \
void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta, \
sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) { \
omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \
b, ldb, c, ldc); \
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
OMATADD_LAUNCHER(float, cublasSgeam)
OMATADD_LAUNCHER(double, cublasDgeam)
OMATADD_LAUNCHER(std::complex<float>, cublasCgeam)
OMATADD_LAUNCHER(std::complex<double>, cublasZgeam)

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
#undef OMATADD_LAUNCHER

// USM APIs

Expand Down Expand Up @@ -537,31 +564,43 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp
throw unimplemented("blas", "gemmt", "for row_major layout");
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
const float *a, int64_t lda, float *b, int64_t ldb,
template <typename Func, typename T>
sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans,
int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb);
auto done = queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(dependencies);
const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = reinterpret_cast<const cuDataType *>(a);
auto b_ = reinterpret_cast<cuDataType *>(b);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
get_cublas_operation(trans), logical_m, logical_n,
(cuDataType *)&alpha, a_, lda, nullptr, nullptr, ldb, b_, ldb);
});
});
return done;
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
const double *a, int64_t lda, double *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
#define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \
const std::vector<sycl::event> &dependencies) { \
return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, \
ldb, dependencies); \
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
std::complex<float> *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
OMATCOPY_LAUNCHER_USM(float, cublasSgeam)
OMATCOPY_LAUNCHER_USM(double, cublasDgeam)
OMATCOPY_LAUNCHER_USM(std::complex<float>, cublasCgeam)
OMATCOPY_LAUNCHER_USM(std::complex<double>, cublasZgeam)

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
std::complex<double> *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for row_major layout");
}
#undef OMATCOPY_LAUNCHER_USM

sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
float *ab, int64_t lda, int64_t ldb,
Expand All @@ -587,35 +626,44 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
throw unimplemented("blas", "imatcopy", "for row_major layout");
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
float alpha, const float *a, int64_t lda, float beta, const float *b,
int64_t ldb, float *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for row_major layout");
template <typename Func, typename T>
inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb, ldc);
auto done = queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(dependencies);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = reinterpret_cast<const cuDataType *>(a);
auto b_ = reinterpret_cast<const cuDataType *>(b);
auto c_ = reinterpret_cast<cuDataType *>(c);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
});
});
return done;
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
double alpha, const double *a, int64_t lda, double beta, const double *b,
int64_t ldb, double *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
#define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \
int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, \
const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \
const std::vector<sycl::event> &dependencies) { \
return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \
lda, beta, b, ldb, c, ldc, dependencies); \
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
std::complex<float> *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
OMATADD_LAUNCHER_USM(float, cublasSgeam)
OMATADD_LAUNCHER_USM(double, cublasDgeam)
OMATADD_LAUNCHER_USM(std::complex<float>, cublasCgeam)
OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
std::complex<double> *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for row_major layout");
}
#undef OMATADD_LAUNCHER_USM

} // namespace row_major
} // namespace cublas
Expand Down