Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TPL Support for BLAS functions (nrm2, axpy, dot, gemm) using CuBLAS (Issue #247) #262

Merged
merged 13 commits into from
Jul 9, 2018
4 changes: 4 additions & 0 deletions Makefile.kokkos-kernels
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_INST_EXECSPACE_CUDA), 1)
endif
endif

ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1)
KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp
endif

KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp)
KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp)

Expand Down
37 changes: 37 additions & 0 deletions src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,47 @@ struct axpby_tpl_spec_avail< \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1> { enum : bool { value = true }; };

#if defined (KOKKOSKERNELS_INST_DOUBLE)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_FLOAT)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex<double>, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex<float>, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif

#endif

// cuBLAS
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS

#define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( SCALAR , LAYOUT, MEMSPACE ) \
template<class ExecSpace> \
struct axpby_tpl_spec_avail< \
SCALAR, \
Kokkos::View<const SCALAR*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
SCALAR, \
Kokkos::View<SCALAR*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1> { enum : bool { value = true }; };

#if defined (KOKKOSKERNELS_INST_DOUBLE)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't you use appropriate kokkos-kernels macros to detect whether those Scalar types (e.g., float) are enabled? Otherwise, you'll be instantiating for types for which the user did not want to instantiate. This will increase build time and library size.

#endif
#if defined (KOKKOSKERNELS_INST_FLOAT)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex<double>, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_)
KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex<float>, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif

#endif
}
Expand Down
157 changes: 157 additions & 0 deletions src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,161 @@ KOKKOSBLAS1_CAXPBY_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, false)

#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS

// cuBLAS
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
#include<KokkosBlas_tpl_spec.hpp>

namespace KokkosBlas {
namespace Impl {

#define KOKKOSBLAS1_DAXPBY_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \
template<class ExecSpace> \
struct Axpby< \
double, \
Kokkos::View<const double*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
double, \
Kokkos::View<double*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1, true, ETI_SPEC_AVAIL> { \
typedef double AV; \
typedef double BV; \
typedef Kokkos::View<const double*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > XV; \
typedef Kokkos::View<double*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > YV; \
typedef typename XV::size_type size_type; \
\
static void \
axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \
const size_type numElems = X.extent(0); \
if((numElems < static_cast<size_type> (INT_MAX)) && (beta == 1.0)) { \
axpby_print_specialization<AV,XV,BV,YV>(); \
const int N = static_cast<int> (numElems); \
constexpr int one = 1; \
KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \
} else \
Axpby<AV,XV,BV,YV,YV::Rank,false,ETI_SPEC_AVAIL>::axpby(alpha,X,beta,Y); \
} \
};


#define KOKKOSBLAS1_SAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \
template<class ExecSpace> \
struct Axpby< \
float, \
Kokkos::View<const float*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
float, \
Kokkos::View<float*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1, true, ETI_SPEC_AVAIL> { \
typedef float AV; \
typedef float BV; \
typedef Kokkos::View<const float*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > XV; \
typedef Kokkos::View<float*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > YV; \
typedef typename XV::size_type size_type; \
\
static void \
axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \
const size_type numElems = X.extent(0); \
if((numElems < static_cast<size_type> (INT_MAX)) && (beta == 1.0f)) { \
axpby_print_specialization<AV,XV,BV,YV>(); \
const int N = static_cast<int> (numElems); \
constexpr int one = 1; \
KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \
} else \
Axpby<AV,XV,BV,YV,YV::Rank,false,ETI_SPEC_AVAIL>::axpby(alpha,X,beta,Y); \
} \
};

#define KOKKOSBLAS1_ZAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \
template<class ExecSpace> \
struct Axpby< \
Kokkos::complex<double>, \
Kokkos::View<const Kokkos::complex<double>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
Kokkos::complex<double>, \
Kokkos::View<Kokkos::complex<double>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1, true, ETI_SPEC_AVAIL> { \
typedef Kokkos::complex<double> AV; \
typedef Kokkos::complex<double> BV; \
typedef Kokkos::View<const Kokkos::complex<double>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > XV; \
typedef Kokkos::View<Kokkos::complex<double>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > YV; \
typedef typename XV::size_type size_type; \
\
static void \
axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \
const size_type numElems = X.extent(0); \
if((numElems < static_cast<size_type> (INT_MAX)) && (beta == 1.0f)) { \
axpby_print_specialization<AV,XV,BV,YV>(); \
const int N = static_cast<int> (numElems); \
constexpr int one = 1; \
KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
cublasZaxpy(s.handle, N, reinterpret_cast<const cuDoubleComplex*>(&alpha), reinterpret_cast<const cuDoubleComplex*>(X.data()), one, reinterpret_cast<cuDoubleComplex*>(Y.data()), one); \
} else \
Axpby<AV,XV,BV,YV,YV::Rank,false,ETI_SPEC_AVAIL>::axpby(alpha,X,beta,Y); \
} \
};

#define KOKKOSBLAS1_CAXPBY_CUBLAS( LAYOUT, MEMSPACE , ETI_SPEC_AVAIL ) \
template<class ExecSpace> \
struct Axpby< \
Kokkos::complex<float>, \
Kokkos::View<const Kokkos::complex<float>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
Kokkos::complex<float>, \
Kokkos::View<Kokkos::complex<float>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1, true, ETI_SPEC_AVAIL> { \
typedef Kokkos::complex<float> AV; \
typedef Kokkos::complex<float> BV; \
typedef Kokkos::View<const Kokkos::complex<float>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > XV; \
typedef Kokkos::View<Kokkos::complex<float>*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> > YV; \
typedef typename XV::size_type size_type; \
\
static void \
axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \
const size_type numElems = X.extent(0); \
if((numElems < static_cast<size_type> (INT_MAX)) && (beta == 1.0f)) { \
axpby_print_specialization<AV,XV,BV,YV>(); \
const int N = static_cast<int> (numElems); \
constexpr int one = 1; \
KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
cublasCaxpy(s.handle, N, reinterpret_cast<const cuComplex*>(&alpha), reinterpret_cast<const cuComplex*>(X.data()), one, reinterpret_cast<cuComplex*>(Y.data()), one); \
} else \
Axpby<AV,XV,BV,YV,YV::Rank,false,ETI_SPEC_AVAIL>::axpby(alpha,X,beta,Y); \
} \
};

KOKKOSBLAS1_DAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true)
KOKKOSBLAS1_DAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false)

KOKKOSBLAS1_SAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true)
KOKKOSBLAS1_SAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false)

KOKKOSBLAS1_ZAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true)
KOKKOSBLAS1_ZAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false)

KOKKOSBLAS1_CAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true)
KOKKOSBLAS1_CAXPBY_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false)

#undef KOKKOSBLAS1_DAXPBY_CUBLAS
#undef KOKKOSBLAS1_SAXPBY_CUBLAS
#undef KOKKOSBLAS1_ZAXPBY_CUBLAS
#undef KOKKOSBLAS1_CAXPBY_CUBLAS
}
}

#endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS

#endif
36 changes: 36 additions & 0 deletions src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,47 @@ Kokkos::View<const SCALAR*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1,1> { enum : bool { value = true }; };

#if defined (KOKKOSKERNELS_INST_DOUBLE)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_FLOAT)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex<double>, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex<float>, Kokkos::LayoutLeft, Kokkos::HostSpace)
#endif

#endif

// cuBLAS
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
// double
#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \
template<class ExecSpace> \
struct dot_tpl_spec_avail< \
Kokkos::View<SCALAR, LAYOUT, Kokkos::HostSpace, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
Kokkos::View<const SCALAR*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
Kokkos::View<const SCALAR*, LAYOUT, Kokkos::Device<ExecSpace, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged> >, \
1,1> { enum : bool { value = true }; };

#if defined (KOKKOSKERNELS_INST_DOUBLE)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif
#if defined (KOKKOSKERNELS_INST_FLOAT)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How come the complex double and complex float macros end with underscores, but the double and float ones don't?

Copy link
Contributor Author

@vqd8a vqd8a Jun 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can use either without or with underscore, since in the KokkosKernels_config.h, it uses

#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE)
#define KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_
#endif

but I just followed the convention in hpp files in \src\impl\generated_specializations_hpp

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, just checking :)

KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex<double>, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif
#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_)
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex<float>, Kokkos::LayoutLeft, Kokkos::CudaSpace)
#endif

#endif

Expand Down
Loading