diff --git a/Makefile.kokkos-kernels b/Makefile.kokkos-kernels index dcccd5cc0b..604efb9e0b 100644 --- a/Makefile.kokkos-kernels +++ b/Makefile.kokkos-kernels @@ -23,7 +23,7 @@ KOKKOSKERNELS_INTERNAL_ENABLE_CUSPARSE := $(strip $(shell echo $(KOKKOSKERNELS_E KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS := $(strip $(shell echo $(KOKKOSKERNELS_ENABLE_TPLS) | grep "cublas" | wc -l)) BLAS_PATH ?= /usr -BLAS_LIBS ?= -L${BLAS_PATH}/lib64 -lblas +BLAS_LIBS ?= -L${BLAS_PATH}/lib64 $(BLAS_LIBRARIES) -lgfortran -lgomp MKL_PATH ?= ${SEMS_INTEL_ROOT}/mkl MKL_LIBS ?= -L${MKL_PATH}/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -L${MKL_PATH}/../compiler/lib/intel64 @@ -386,6 +386,10 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_INST_EXECSPACE_CUDA), 1) endif endif +ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1) + KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp +endif + KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp) KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 6083785c2c..9a49ff8ecc 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -248,6 +248,9 @@ elif [ "$MACHINE" = "white" ]; then CUDA_MODULE_LIST="/,gcc/5.4.0" CUDA_MODULE_LIST2="/,gcc/6.3.0,ibm/xl/13.1.6" + module load netlib + export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a" + # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 16db928261..627bfafa0d 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -71,10 +71,47 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; +#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( SCALAR , LAYOUT, MEMSPACE ) \ +template \ +struct axpby_tpl_spec_avail< \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { enum : bool { value = true }; }; + +#if defined (KOKKOSKERNELS_INST_DOUBLE) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) +KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif #endif } diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index e33e1f22b7..e34c9d59ce 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -213,4 +213,161 @@ KOKKOSBLAS1_CAXPBY_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, false) #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DAXPBY_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Axpby< \ + double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + double, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef double AV; \ + typedef double BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > YV; \ + typedef typename XV::size_type size_type; \ +\ + static void \ + axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + const size_type numElems = X.extent(0); \ + if((numElems < static_cast (INT_MAX)) && (beta == 1.0)) { \ + axpby_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(alpha,X,beta,Y); \ + } \ +}; + + +#define KOKKOSBLAS1_SAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \ +template \ +struct Axpby< \ + float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + float, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef float AV; \ + typedef float BV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > YV; \ + typedef typename XV::size_type size_type; \ +\ + static void \ + axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + const size_type numElems = X.extent(0); \ + if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + } else \ + Axpby::axpby(alpha,X,beta,Y); \ + } \ +}; + +#define KOKKOSBLAS1_ZAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \ +template \ +struct Axpby< \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > YV; \ + typedef typename XV::size_type size_type; \ +\ + static void \ + axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + const size_type numElems = X.extent(0); \ + if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one); \ + } else \ + Axpby::axpby(alpha,X,beta,Y); \ + } \ +}; + +#define KOKKOSBLAS1_CAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \ +template \ +struct Axpby< \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > YV; \ + typedef typename XV::size_type size_type; \ +\ + static void \ + axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + const size_type numElems = X.extent(0); \ + if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one); \ + } else \ + Axpby::axpby(alpha,X,beta,Y); \ + } \ +}; + +KOKKOSBLAS1_DAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +#undef KOKKOSBLAS1_DAXPBY_CUBLAS +#undef KOKKOSBLAS1_SAXPBY_CUBLAS +#undef KOKKOSBLAS1_ZAXPBY_CUBLAS +#undef KOKKOSBLAS1_CAXPBY_CUBLAS +} +} + +#endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS + #endif diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 5439f74a33..c9b7a5532b 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -71,11 +71,47 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; +#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif + +#endif +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +// double +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \ +template \ +struct dot_tpl_spec_avail< \ +Kokkos::View >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1,1> { enum : bool { value = true }; }; + +#if defined (KOKKOSKERNELS_INST_DOUBLE) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index f574ee6a3b..53f51a60da 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -212,4 +212,156 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, fals #endif +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Dot< \ +Kokkos::View >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1,1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot (RV& R, const XV& X, const XV& Y) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + } else { \ + Dot::dot(R,X,Y); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Dot< \ +Kokkos::View >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1,1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot (RV& R, const XV& X, const XV& Y) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + } else { \ + Dot::dot(R,X,Y); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Dot< \ +Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +1,1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot (RV& R, const XV& X, const XV& Y) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasZdotu(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ + } else { \ + Dot::dot(R,X,Y); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Dot< \ +Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +1,1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot (RV& R, const XV& X, const XV& Y) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasCdotu(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ + } else { \ + Dot::dot(R,X,Y); \ + } \ + } \ +}; + +KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +} +} + +#endif + #endif diff --git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index bd4041f499..46c3915b0b 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -69,14 +69,49 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; +#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +// double +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \ +template \ +struct nrm2_tpl_spec_avail< \ +Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1> { enum : bool { value = true }; }; +#if defined (KOKKOSKERNELS_INST_DOUBLE) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif #endif + } } #endif diff --git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index d2b28e8b5e..2a4facce96 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -202,4 +202,152 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, fal #endif +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Nrm2< \ +Kokkos::View >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasDnrm2(s.handle, N, X.data(), one, &R()); \ + if(!take_sqrt) R() = R()*R(); \ + } else { \ + Nrm2::nrm2(R,X,take_sqrt); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Nrm2< \ +Kokkos::View >, \ +Kokkos::View, \ + Kokkos::MemoryTraits >, \ +1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasSnrm2(s.handle, N, X.data(), one, &R()); \ + if(!take_sqrt) R() = R()*R(); \ + } else { \ + Nrm2::nrm2(R,X,take_sqrt); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Nrm2< \ +Kokkos::View >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasDznrm2(s.handle, N, reinterpret_cast(X.data()), one, &R()); \ + if(!take_sqrt) R() = R()*R(); \ + } else { \ + Nrm2::nrm2(R,X,take_sqrt); \ + } \ + } \ +}; + +#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ +template \ +struct Nrm2< \ +Kokkos::View >, \ +Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ +1,true, ETI_SPEC_AVAIL > { \ + \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ + { \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast (INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast (numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasScnrm2(s.handle, N, reinterpret_cast(X.data()), one, &R()); \ + if(!take_sqrt) R() = R()*R(); \ + } else { \ + Nrm2::nrm2(R,X,take_sqrt); \ + } \ + } \ +}; + +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) + +} +} + +#endif + #endif diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 4587764caa..15f6bbc9eb 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -66,15 +66,89 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +#if defined (KOKKOSKERNELS_INST_DOUBLE) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif + +#if defined (KOKKOSKERNELS_INST_DOUBLE) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( SCALAR , LAYOUTA, LAYOUTB, LAYOUTC, MEMSPACE ) \ +template \ +struct gemm_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > \ + > { enum : bool { value = true }; }; + +#if defined (KOKKOSKERNELS_INST_DOUBLE) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#endif + +#if defined (KOKKOSKERNELS_INST_DOUBLE) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_FLOAT) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +#endif +#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ + && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) + KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +#endif #endif } diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index cd74310e6c..8579103d95 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -325,4 +325,303 @@ KOKKOSBLAS3_CGEMM_BLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::Layout } #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS3_DGEMM_CUBLAS( LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL ) \ +template \ +struct GEMM< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > CViewType; \ + \ + static void \ + gemm (const char transA[], \ + const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, \ + const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,double]"); \ + const bool A_t = (transA[0]!='N') && (transA[0]!='n'); \ + const int M = static_cast (C.extent(0)); \ + const int N = static_cast (C.extent(1)); \ + const int K = static_cast (A.extent(A_t?0:1)); \ + int strides[2]; \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + A.stride(strides); \ + const int LDA = strides[A_is_lr?0:1]; \ + B.stride(strides); \ + const int LDB = strides[B_is_lr?0:1]; \ + C.stride(strides); \ + const int LDC = strides[C_is_lr?0:1]; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0]=='N')||(transA[0]=='n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0]=='T')||(transA[0]=='t')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0]=='N')||(transB[0]=='n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0]=='T')||(transB[0]=='t')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + cublasDgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), LDA, B.data(), LDB, &beta, C.data(), LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + cublasDgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), LDB, A.data(), LDA, &beta, C.data(), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ +}; + +#define KOKKOSBLAS3_SGEMM_CUBLAS( LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL ) \ +template \ +struct GEMM< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > CViewType; \ + \ + static void \ + gemm (const char transA[], \ + const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, \ + const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,float]"); \ + const bool A_t = (transA[0]!='N') && (transA[0]!='n'); \ + const int M = static_cast (C.extent(0)); \ + const int N = static_cast (C.extent(1)); \ + const int K = static_cast (A.extent(A_t?0:1)); \ + int strides[2]; \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + A.stride(strides); \ + const int LDA = strides[A_is_lr?0:1]; \ + B.stride(strides); \ + const int LDB = strides[B_is_lr?0:1]; \ + C.stride(strides); \ + const int LDC = strides[C_is_lr?0:1]; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0]=='N')||(transA[0]=='n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0]=='T')||(transA[0]=='t')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0]=='N')||(transB[0]=='n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0]=='T')||(transB[0]=='t')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + cublasSgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), LDA, B.data(), LDB, &beta, C.data(), LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + cublasSgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), LDB, A.data(), LDA, &beta, C.data(), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ +}; + +#define KOKKOSBLAS3_ZGEMM_CUBLAS( LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL ) \ +template \ +struct GEMM< \ + Kokkos::View**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTC, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > CViewType; \ + \ + static void \ + gemm (const char transA[], \ + const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, \ + const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,complex]"); \ + const bool A_t = (transA[0]!='N') && (transA[0]!='n'); \ + const int M = static_cast (C.extent(0)); \ + const int N = static_cast (C.extent(1)); \ + const int K = static_cast (A.extent(A_t?0:1)); \ + int strides[2]; \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + A.stride(strides); \ + const int LDA = strides[A_is_lr?0:1]; \ + B.stride(strides); \ + const int LDB = strides[B_is_lr?0:1]; \ + C.stride(strides); \ + const int LDC = strides[C_is_lr?0:1]; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0]=='N')||(transA[0]=='n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0]=='T')||(transA[0]=='t')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0]=='N')||(transB[0]=='n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0]=='T')||(transB[0]=='t')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + cublasZgemm(s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB, reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + cublasZgemm(s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), reinterpret_cast(B.data()), LDB, reinterpret_cast(A.data()), LDA, reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ +}; \ + +#define KOKKOSBLAS3_CGEMM_CUBLAS( LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL ) \ +template \ +struct GEMM< \ + Kokkos::View**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTC, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > CViewType; \ + \ + static void \ + gemm (const char transA[], \ + const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, \ + const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,complex]"); \ + const bool A_t = (transA[0]!='N') && (transA[0]!='n'); \ + const int M = static_cast (C.extent(0)); \ + const int N = static_cast (C.extent(1)); \ + const int K = static_cast (A.extent(A_t?0:1)); \ + int strides[2]; \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + A.stride(strides); \ + const int LDA = strides[A_is_lr?0:1]; \ + B.stride(strides); \ + const int LDB = strides[B_is_lr?0:1]; \ + C.stride(strides); \ + const int LDC = strides[C_is_lr?0:1]; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0]=='N')||(transA[0]=='n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0]=='T')||(transA[0]=='t')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0]=='N')||(transB[0]=='n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0]=='T')||(transB[0]=='t')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + cublasCgemm(s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB, reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + cublasCgemm(s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), reinterpret_cast(B.data()), LDB, reinterpret_cast(A.data()), LDA, reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC); \ + Kokkos::Profiling::popRegion(); \ + } \ +}; + +KOKKOSBLAS3_DGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_DGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_DGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_SGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_SGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_SGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_ZGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_ZGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_ZGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS3_CGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS( Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS3_CGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true) +KOKKOSBLAS3_CGEMM_CUBLAS( Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +} +} +#endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS + #endif diff --git a/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp b/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp new file mode 100644 index 0000000000..91480f766b --- /dev/null +++ b/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp @@ -0,0 +1,3 @@ +#include +#include +#include \ No newline at end of file diff --git a/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp new file mode 100644 index 0000000000..a6fed196bd --- /dev/null +++ b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp @@ -0,0 +1,28 @@ +#ifndef KOKKOSBLAS_CUDA_TPL_HPP_ +#define KOKKOSBLAS_CUDA_TPL_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +CudaBlasSingleton::CudaBlasSingleton() +{ + cublasStatus_t stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + Kokkos::push_finalize_hook ([&] () { + cublasDestroy(handle); + }); +} + +CudaBlasSingleton & CudaBlasSingleton::singleton() +{ static CudaBlasSingleton s ; return s ; } + +} +} +#endif + +#endif \ No newline at end of file diff --git a/src/impl/tpls/KokkosBlas_tpl_spec.hpp b/src/impl/tpls/KokkosBlas_tpl_spec.hpp new file mode 100644 index 0000000000..674867d92c --- /dev/null +++ b/src/impl/tpls/KokkosBlas_tpl_spec.hpp @@ -0,0 +1,23 @@ +#ifndef KOKKOSBLAS_TPL_SPEC_HPP_ +#define KOKKOSBLAS_TPL_SPEC_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include "cuda_runtime.h" +#include "cublas_v2.h" + +namespace KokkosBlas { +namespace Impl { + +struct CudaBlasSingleton { + cublasHandle_t handle; + + CudaBlasSingleton(); + + static CudaBlasSingleton & singleton(); +}; + +} +} +#endif + +#endif \ No newline at end of file