From 041cd843eb8697ff9db09a6af3cf83340e49dfe8 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 30 Oct 2018 00:26:04 -0600 Subject: [PATCH 01/47] Add getrs implementation and unit tests --- .../KokkosBatched_InverseLU_Serial_Impl.hpp | 10 +- .../KokkosBatched_InverseLU_Team_Impl.hpp | 10 +- src/batched/KokkosBatched_SolveLU_Decl.hpp | 41 +++ .../KokkosBatched_SolveLU_Serial_Impl.hpp | 128 +++++++++ .../KokkosBatched_SolveLU_Team_Impl.hpp | 128 +++++++++ .../KokkosBatched_Trsm_Serial_Impl.hpp | 190 ++++++++++++++ src/batched/KokkosBatched_Trsm_Team_Impl.hpp | 92 +++++++ src/batched/KokkosBatched_Util.hpp | 3 +- unit_test/Makefile | 12 + .../batched/Test_Batched_SerialSolveLU.hpp | 229 ++++++++++++++++ .../Test_Batched_SerialSolveLU_Complex.hpp | 9 + .../Test_Batched_SerialSolveLU_Real.hpp | 20 ++ .../Test_Batched_SerialTrsm_Complex.hpp | 43 ++- .../batched/Test_Batched_SerialTrsm_Real.hpp | 42 +++ .../batched/Test_Batched_TeamSolveLU.hpp | 246 ++++++++++++++++++ .../Test_Batched_TeamSolveLU_Complex.hpp | 9 + .../batched/Test_Batched_TeamSolveLU_Real.hpp | 20 ++ .../batched/Test_Batched_TeamTrsm_Complex.hpp | 43 ++- .../batched/Test_Batched_TeamTrsm_Real.hpp | 42 +++ ...est_Cuda_Batched_SerialSolveLU_Complex.cpp | 3 + .../Test_Cuda_Batched_SerialSolveLU_Real.cpp | 3 + .../Test_Cuda_Batched_TeamSolveLU_Complex.cpp | 3 + .../Test_Cuda_Batched_TeamSolveLU_Real.cpp | 3 + ...t_OpenMP_Batched_SerialSolveLU_Complex.cpp | 3 + ...Test_OpenMP_Batched_SerialSolveLU_Real.cpp | 3 + ...est_OpenMP_Batched_TeamSolveLU_Complex.cpp | 3 + .../Test_OpenMP_Batched_TeamSolveLU_Real.cpp | 3 + ...t_Serial_Batched_SerialSolveLU_Complex.cpp | 3 + ...Test_Serial_Batched_SerialSolveLU_Real.cpp | 3 + ...est_Serial_Batched_TeamSolveLU_Complex.cpp | 3 + .../Test_Serial_Batched_TeamSolveLU_Real.cpp | 3 + 31 files changed, 1338 insertions(+), 15 deletions(-) create mode 100644 src/batched/KokkosBatched_SolveLU_Decl.hpp create mode 100644 src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp create 
mode 100644 src/batched/KokkosBatched_SolveLU_Team_Impl.hpp create mode 100644 unit_test/batched/Test_Batched_SerialSolveLU.hpp create mode 100644 unit_test/batched/Test_Batched_SerialSolveLU_Complex.hpp create mode 100644 unit_test/batched/Test_Batched_SerialSolveLU_Real.hpp create mode 100644 unit_test/batched/Test_Batched_TeamSolveLU.hpp create mode 100644 unit_test/batched/Test_Batched_TeamSolveLU_Complex.hpp create mode 100644 unit_test/batched/Test_Batched_TeamSolveLU_Real.hpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Complex.cpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Real.cpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Complex.cpp create mode 100644 unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Real.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Complex.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Real.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Complex.cpp create mode 100644 unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Real.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_SerialSolveLU_Complex.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_SerialSolveLU_Real.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_TeamSolveLU_Complex.cpp create mode 100644 unit_test/serial/Test_Serial_Batched_TeamSolveLU_Real.cpp diff --git a/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp b/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp index 29dcf1c9f8..0615e928c9 100644 --- a/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -5,8 +5,8 @@ /// \author Vinh Dang (vqdang@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Trsm_Decl.hpp" -#include "KokkosBatched_Trsm_Serial_Impl.hpp" +#include "KokkosBatched_SolveLU_Decl.hpp" +#include "KokkosBatched_SolveLU_Serial_Impl.hpp" namespace KokkosBatched { 
namespace Experimental { @@ -97,9 +97,8 @@ namespace KokkosBatched { } //First, compute L inverse by solving the system L*Linv = I for Linv - SerialTrsm::invoke(one, A, B); //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv - SerialTrsm::invoke(one, A, B); + SerialSolveLU::invoke(A,B); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -140,9 +139,8 @@ namespace KokkosBatched { } //First, compute L inverse by solving the system L*Linv = I for Linv - SerialTrsm::invoke(one, A, B); //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv - SerialTrsm::invoke(one, A, B); + SerialSolveLU::invoke(A,B); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll diff --git a/src/batched/KokkosBatched_InverseLU_Team_Impl.hpp b/src/batched/KokkosBatched_InverseLU_Team_Impl.hpp index 0544e005b2..30cc86767e 100644 --- a/src/batched/KokkosBatched_InverseLU_Team_Impl.hpp +++ b/src/batched/KokkosBatched_InverseLU_Team_Impl.hpp @@ -5,8 +5,8 @@ /// \author Vinh Dang (vqdang@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Trsm_Decl.hpp" -#include "KokkosBatched_Trsm_Team_Impl.hpp" +#include "KokkosBatched_SolveLU_Decl.hpp" +#include "KokkosBatched_SolveLU_Team_Impl.hpp" namespace KokkosBatched { namespace Experimental { @@ -43,9 +43,8 @@ namespace KokkosBatched { }); //First, compute L inverse by solving the system L*Linv = I for Linv - TeamTrsm::invoke(member, one, A, B); //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv - TeamTrsm::invoke(member, one, A, B); + TeamSolveLU::invoke(member, A, B); Kokkos::parallel_for(Kokkos::TeamThreadRange(member,A.extent(0)*A.extent(1)),[&](const int &tid) { int i = tid/A.extent(1); @@ -82,9 +81,8 @@ namespace KokkosBatched { }); //First, compute L inverse by solving the system L*Linv = I for Linv - TeamTrsm::invoke(member, one, A, B); //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv - TeamTrsm::invoke(member, one, A, B); + 
TeamSolveLU::invoke(member, A, B); Kokkos::parallel_for(Kokkos::TeamThreadRange(member,A.extent(0)*A.extent(1)),[&](const int &tid) { int i = tid/A.extent(1); diff --git a/src/batched/KokkosBatched_SolveLU_Decl.hpp b/src/batched/KokkosBatched_SolveLU_Decl.hpp new file mode 100644 index 0000000000..9c8b71a58d --- /dev/null +++ b/src/batched/KokkosBatched_SolveLU_Decl.hpp @@ -0,0 +1,41 @@ +#ifndef __KOKKOSBATCHED_SOLVELU_DECL_HPP__ +#define __KOKKOSBATCHED_SOLVELU_DECL_HPP__ + + +/// \author Vinh Dang (vqdang@sandia.gov) + +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + namespace Experimental { + + template + struct SerialSolveLU { + // no piv version + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const AViewType &A, + const BViewType &B); + }; + + template + struct TeamSolveLU { + // no piv version + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A, + const BViewType &B); + }; + + } +} + +#endif diff --git a/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp b/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp new file mode 100644 index 0000000000..78fdc2a01c --- /dev/null +++ b/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp @@ -0,0 +1,128 @@ +#ifndef __KOKKOSBATCHED_SOLVELU_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_SOLVELU_SERIAL_IMPL_HPP__ + + +/// \author Vinh Dang (vqdang@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Trsm_Serial_Impl.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Impl + /// ========= + + /// + /// SolveLU no piv + /// + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialSolveLU:: + invoke(const AViewType &A, + const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + 
static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= U*X) by solving the system L*Y = B for Y + SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system U*X = Y for X + SerialTrsm::invoke(one, A, B); + + return 0; + } + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialSolveLU:: + invoke(const AViewType &A, + const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= U*X) by solving the system L*Y = B for Y + SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system U*X = Y for X + SerialTrsm::invoke(one, A, B); + + return 0; + } + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialSolveLU:: + invoke(const AViewType &A, + const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system L'*X = Y for X + SerialTrsm::invoke(one, A, B); + + return 0; + } + + 
template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialSolveLU:: + invoke(const AViewType &A, + const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system L'*X = Y for X + SerialTrsm::invoke(one, A, B); + + return 0; + } + + } +} + +#endif diff --git a/src/batched/KokkosBatched_SolveLU_Team_Impl.hpp b/src/batched/KokkosBatched_SolveLU_Team_Impl.hpp new file mode 100644 index 0000000000..061f2bcce5 --- /dev/null +++ b/src/batched/KokkosBatched_SolveLU_Team_Impl.hpp @@ -0,0 +1,128 @@ +#ifndef __KOKKOSBATCHED_SOLVELU_TEAM_IMPL_HPP__ +#define __KOKKOSBATCHED_SOLVELU_TEAM_IMPL_HPP__ + + +/// \author Vinh Dang (vqdang@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Trsm_Team_Impl.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Team Impl + /// ========= + + /// + /// SolveLU no piv + /// + + template + struct TeamSolveLU { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + 
+ const ScalarType one(1.0); + + //First, compute Y (= U*X) by solving the system L*Y = B for Y + TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system U*X = Y for X + TeamTrsm::invoke(member, one, A, B); + + return 0; + } + }; + + template + struct TeamSolveLU { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= U*X) by solving the system L*Y = B for Y + TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system U*X = Y for X + TeamTrsm::invoke(member, one, A, B); + + return 0; + } + }; + + template + struct TeamSolveLU { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system L'*X = Y for X + TeamTrsm::invoke(member, one, A, B); + + return 0; + } + }; + + template + struct TeamSolveLU { + template + KOKKOS_INLINE_FUNCTION + static int + 
invoke(const MemberType &member, const AViewType &A, const BViewType &B) { + static_assert(AViewType::rank == 2, "A should have two dimensions"); + static_assert((BViewType::rank == 1)||(BViewType::rank == 2), "B should have either one dimension or two dimensions"); + static_assert(std::is_same::value, "A and B should be on the same memory space"); + assert(A.extent(0)==A.extent(1)); + assert(A.extent(1)==B.extent(0)); + + typedef typename AViewType::value_type ScalarType; + + const ScalarType one(1.0); + + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system L'*X = Y for X + TeamTrsm::invoke(member, one, A, B); + + return 0; + } + }; + + } +} + +#endif diff --git a/src/batched/KokkosBatched_Trsm_Serial_Impl.hpp b/src/batched/KokkosBatched_Trsm_Serial_Impl.hpp index 556b502e90..d08983d45b 100644 --- a/src/batched/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_Trsm_Serial_Impl.hpp @@ -297,6 +297,196 @@ namespace KokkosBatched { } }; + /// + /// L/L/T + /// + /// B := inv(tril(AT)) (alpha*B) + /// A(m x m), B(m x n) + +#if \ + defined(__KOKKOSBATCHED_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + typedef typename BViewType::value_type vector_type; + //typedef typename vector_type::value_type value_type; + + const int + m = B.dimension(0), + n = B.dimension(1); + + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? 
MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + + // no error check + int r_val = 0; + if (A.stride_0() == 1 && B.stride_0() == 1) { + mkl_dtrsm_compact(MKL_COL_MAJOR, + MKL_LEFT, MKL_LOWER, MKL_TRANS, + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, + alpha, + (const double*)A.data(), A.stride_1(), + (double*)B.data(), B.stride_1(), + format, (MKL_INT)vector_type::vector_length); + } else if (A.stride_1() == 1 && B.stride_1() == 1) { + mkl_dtrsm_compact(MKL_ROW_MAJOR, + MKL_LEFT, MKL_LOWER, MKL_TRANS, + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, + alpha, + (const double*)A.data(), A.stride_0(), + (double*)B.data(), B.stride_0(), + format, (MKL_INT)vector_type::vector_length); + } else { + r_val = -1; + } + return r_val; + } + }; +#endif + + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke(ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + /// + /// L/U/T + /// + /// B := inv(triu(AT)) (alpha*B) + /// A(m x m), B(m x n) +#if \ + defined(__KOKKOSBATCHED_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + typedef typename BViewType::value_type vector_type; + //typedef typename vector_type::value_type value_type; +
+ const int + m = B.dimension(0), + n = B.dimension(1); + + static_assert(is_vector::value, "value type is not vector type"); + static_assert(vector_type::vector_length == 4 || vector_type::vector_length == 8, + "AVX, AVX2 and AVX512 is supported"); + const MKL_COMPACT_PACK format = vector_type::vector_length == 8 ? MKL_COMPACT_AVX512 : MKL_COMPACT_AVX; + + // no error check + int r_val = 0; + if (A.stride_0() == 1 && B.stride_0() == 1) { + mkl_dtrsm_compact(MKL_COL_MAJOR, + MKL_LEFT, MKL_UPPER, MKL_TRANS, + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, + alpha, + (const double*)A.data(), A.stride_1(), + (double*)B.data(), B.stride_1(), + format, (MKL_INT)vector_type::vector_length); + } else if (A.stride_1() == 1 && B.stride_1() == 1) { + mkl_dtrsm_compact(MKL_ROW_MAJOR, + MKL_LEFT, MKL_UPPER, MKL_TRANS, + ArgDiag::use_unit_diag ? MKL_UNIT : MKL_NONUNIT, + m, n, + alpha, + (const double*)A.data(), A.stride_0(), + (double*)B.data(), B.stride_0(), + format, (MKL_INT)vector_type::vector_length); + } else { + r_val = -1; + } + return r_val; + } + }; +#endif + + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + + template + struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftUpper::invoke(ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + } } diff --git a/src/batched/KokkosBatched_Trsm_Team_Impl.hpp b/src/batched/KokkosBatched_Trsm_Team_Impl.hpp index 57bb2c5e45..a2f6e18c0b 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Impl.hpp 
+++ b/src/batched/KokkosBatched_Trsm_Team_Impl.hpp @@ -152,6 +152,98 @@ namespace KokkosBatched { } }; + /// + /// L/L/T + /// + /// B := inv(tril(AT)) (alpha*B) + /// A(m x m), B(m x n) + + template + struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke(member, + ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + + template + struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke(member, + ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + + /// + /// L/U/T + /// + /// B := inv(triu(AT)) (alpha*B) + /// A(m x m), B(m x n) + + template + struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke(member, + ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + + template + struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftUpper::invoke(member, + ArgDiag::use_unit_diag, + B.extent(0), B.extent(1), + alpha, + A.data(), A.stride_1(), A.stride_0(), + B.data(), B.stride_0(), B.stride_1()); + } + }; + } } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 482bbc9536..bf28ac87fa 100644 --- 
a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -273,7 +273,8 @@ namespace KokkosBatched { using Gemm = Level3; using Trsm = Level3; using LU = Level3; - using InverseLU = Level3; + using InverseLU = Level3; + using SolveLU = Level3; struct Level2 { struct Unblocked {}; diff --git a/unit_test/Makefile b/unit_test/Makefile index a7c638a6d9..62996ffb85 100644 --- a/unit_test/Makefile +++ b/unit_test/Makefile @@ -143,6 +143,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_OPENMP), 1) OBJ_OPENMP += Test_OpenMP_Batched_TeamTrsv_Real.o OBJ_OPENMP += Test_OpenMP_Batched_SerialInverseLU_Real.o OBJ_OPENMP += Test_OpenMP_Batched_TeamInverseLU_Real.o + OBJ_OPENMP += Test_OpenMP_Batched_SerialSolveLU_Real.o + OBJ_OPENMP += Test_OpenMP_Batched_TeamSolveLU_Real.o # Complex OBJ_OPENMP += Test_OpenMP_Batched_SerialMatUtil_Complex.o OBJ_OPENMP += Test_OpenMP_Batched_SerialGemm_Complex.o @@ -158,6 +160,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_OPENMP), 1) OBJ_OPENMP += Test_OpenMP_Batched_TeamTrsv_Complex.o OBJ_OPENMP += Test_OpenMP_Batched_SerialInverseLU_Complex.o OBJ_OPENMP += Test_OpenMP_Batched_TeamInverseLU_Complex.o + OBJ_OPENMP += Test_OpenMP_Batched_SerialSolveLU_Complex.o + OBJ_OPENMP += Test_OpenMP_Batched_TeamSolveLU_Complex.o # Vector OBJ_OPENMP += Test_OpenMP_Batched_VectorArithmatic.o OBJ_OPENMP += Test_OpenMP_Batched_VectorMath.o @@ -249,6 +253,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_CUDA), 1) OBJ_CUDA += Test_Cuda_Batched_TeamTrsv_Real.o OBJ_CUDA += Test_Cuda_Batched_SerialInverseLU_Real.o OBJ_CUDA += Test_Cuda_Batched_TeamInverseLU_Real.o + OBJ_CUDA += Test_Cuda_Batched_SerialSolveLU_Real.o + OBJ_CUDA += Test_Cuda_Batched_TeamSolveLU_Real.o # Complex OBJ_CUDA += Test_Cuda_Batched_SerialMatUtil_Complex.o OBJ_CUDA += Test_Cuda_Batched_SerialGemm_Complex.o @@ -264,6 +270,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_CUDA), 1) OBJ_CUDA += Test_Cuda_Batched_TeamTrsv_Complex.o OBJ_CUDA += Test_Cuda_Batched_SerialInverseLU_Complex.o OBJ_CUDA += 
Test_Cuda_Batched_TeamInverseLU_Complex.o + OBJ_CUDA += Test_Cuda_Batched_SerialSolveLU_Complex.o + OBJ_CUDA += Test_Cuda_Batched_TeamSolveLU_Complex.o TARGETS += KokkosKernels_UnitTest_Cuda TEST_TARGETS += test-cuda endif @@ -350,6 +358,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_SERIAL), 1) OBJ_SERIAL += Test_Serial_Batched_TeamTrsv_Real.o OBJ_SERIAL += Test_Serial_Batched_SerialInverseLU_Real.o OBJ_SERIAL += Test_Serial_Batched_TeamInverseLU_Real.o + OBJ_SERIAL += Test_Serial_Batched_SerialSolveLU_Real.o + OBJ_SERIAL += Test_Serial_Batched_TeamSolveLU_Real.o # Complex OBJ_SERIAL += Test_Serial_Batched_SerialMatUtil_Complex.o OBJ_SERIAL += Test_Serial_Batched_SerialGemm_Complex.o @@ -365,6 +375,8 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_SERIAL), 1) OBJ_SERIAL += Test_Serial_Batched_TeamTrsv_Complex.o OBJ_SERIAL += Test_Serial_Batched_SerialInverseLU_Complex.o OBJ_SERIAL += Test_Serial_Batched_TeamInverseLU_Complex.o + OBJ_SERIAL += Test_Serial_Batched_SerialSolveLU_Complex.o + OBJ_SERIAL += Test_Serial_Batched_TeamSolveLU_Complex.o # Vector OBJ_SERIAL += Test_Serial_Batched_VectorArithmatic.o OBJ_SERIAL += Test_Serial_Batched_VectorMath.o diff --git a/unit_test/batched/Test_Batched_SerialSolveLU.hpp b/unit_test/batched/Test_Batched_SerialSolveLU.hpp new file mode 100644 index 0000000000..a482eafbee --- /dev/null +++ b/unit_test/batched/Test_Batched_SerialSolveLU.hpp @@ -0,0 +1,229 @@ +/// \author Vinh Dang (vqdang@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +//#include "KokkosBatched_Vector.hpp" + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_SolveLU_Decl.hpp" +#include "KokkosBatched_SolveLU_Serial_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched::Experimental; + +namespace Test { + + template + struct ParamTag { + typedef TA 
transA; + typedef TB transB; + }; + + template + struct Functor_BatchedSerialGemm { + ViewType _a, _b, _c; + + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, + const ViewType &a, + const ViewType &b, + const ScalarType beta, + const ViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int i=0;i(aa.extent(0));++i) + aa(i,i) += 10.0; + + SerialGemm:: + invoke(_alpha, aa, bb, _beta, cc); + } + + inline + void run() { + Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::parallel_for(policy, *this); + } + }; + + template + struct Functor_BatchedSerialLU { + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialLU(const ViewType &a) + : _a(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int i=0;i(aa.extent(0));++i) + aa(i,i) += 10.0; + + SerialLU::invoke(aa); + } + + inline + void run() { + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(policy, *this); + } + }; + + template + struct Functor_TestBatchedSerialSolveLU { + ViewType _a; + ViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedSerialSolveLU(const ViewType &a, const ViewType &b) + : _a(a), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialSolveLU::invoke(aa,bb); + } + + inline + void run() { + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(policy, *this); + } + }; + + template + void impl_test_batched_solvelu(const int N, const 
int BlkSize) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + ViewType a0("a0", N, BlkSize, BlkSize); + ViewType a1("a1", N, BlkSize, BlkSize); + ViewType b ("b", N, BlkSize, 5 ); + ViewType x0("x0", N, BlkSize, 5 ); + ViewType a0_T("a0_T", N, BlkSize, BlkSize); + ViewType b_T ("b_T", N, BlkSize, 5 ); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(a0, random, value_type(1.0)); + Kokkos::fill_random(x0, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a1, a0); + Kokkos::deep_copy(a0_T, a0); + + value_type alpha = 1.0, beta = 0.0; + typedef ParamTag param_tag_type; + + Functor_BatchedSerialGemm(alpha, a0, x0, beta, b).run(); + + Functor_BatchedSerialLU(a1).run(); + + Functor_TestBatchedSerialSolveLU(a1,b).run(); + + Kokkos::fence(); + + //Transpose + typedef ParamTag param_tag_type_T; + + Functor_BatchedSerialGemm(alpha, a0_T, x0, beta, b_T).run(); + + Functor_TestBatchedSerialSolveLU(a1,b_T).run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror x0_host = Kokkos::create_mirror_view(x0); + typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + typename ViewType::HostMirror b_T_host = Kokkos::create_mirror_view(b_T); + + Kokkos::deep_copy(x0_host, x0); + Kokkos::deep_copy(b_host, b); + Kokkos::deep_copy(b_T_host, b_T); + + /// check x0 = b ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int k=0;k +int test_batched_solvelu() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_solvelu( 0, 10); + for (int i=0;i<10;++i) { + Test::impl_test_batched_solvelu(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_solvelu( 0, 10); + for (int 
i=0;i<10;++i) { + Test::impl_test_batched_solvelu(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_SerialSolveLU_Complex.hpp b/unit_test/batched/Test_Batched_SerialSolveLU_Complex.hpp new file mode 100644 index 0000000000..67af48c726 --- /dev/null +++ b/unit_test/batched/Test_Batched_SerialSolveLU_Complex.hpp @@ -0,0 +1,9 @@ + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_serial_solvelu_dcomplex ) { + //printf("Batched serial solveLU - double complex - algorithm type: Unblocked\n"); + test_batched_solvelu,Algo::SolveLU::Unblocked>(); + //printf("Batched serial solveLU - double complex - algorithm type: Blocked\n"); + test_batched_solvelu,Algo::SolveLU::Blocked>(); +} +#endif diff --git a/unit_test/batched/Test_Batched_SerialSolveLU_Real.hpp b/unit_test/batched/Test_Batched_SerialSolveLU_Real.hpp new file mode 100644 index 0000000000..1a551093ac --- /dev/null +++ b/unit_test/batched/Test_Batched_SerialSolveLU_Real.hpp @@ -0,0 +1,20 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_serial_solvelu_float ) { + //printf("Batched serial solveLU - float - algorithm type: Unblocked\n"); + test_batched_solvelu(); + //printf("Batched serial solveLU - float - algorithm type: Blocked\n"); + test_batched_solvelu(); +} +#endif + + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_serial_solvelu_double ) { + //printf("Batched serial solveLU - double - algorithm type: Unblocked\n"); + test_batched_solvelu(); + //printf("Batched serial solveLU - double - algorithm type: Blocked\n"); + test_batched_solvelu(); +} +#endif + diff --git a/unit_test/batched/Test_Batched_SerialTrsm_Complex.hpp b/unit_test/batched/Test_Batched_SerialTrsm_Complex.hpp index 9b1c6582c0..57c1ec3509 100644 --- a/unit_test/batched/Test_Batched_SerialTrsm_Complex.hpp +++ b/unit_test/batched/Test_Batched_SerialTrsm_Complex.hpp @@ -31,7 +31,27 @@ TEST_F( TestCategory, 
batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); } - +// +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +// TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_dcomplex ) { +// typedef ::Test::ParamTag param_tag_type; +// typedef Algo::Trsm::Blocked algo_tag_type; +// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// } TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double ) { @@ -64,4 +84,25 @@ TEST_F( TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_double ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm,double,param_tag_type,algo_tag_type>(); } +// +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_double ) { + typedef ::Test::ParamTag 
param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +// TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_double ) { +// typedef ::Test::ParamTag param_tag_type; +// typedef Algo::Trsm::Blocked algo_tag_type; +// test_batched_trsm,double,param_tag_type,algo_tag_type>(); +// } #endif diff --git a/unit_test/batched/Test_Batched_SerialTrsm_Real.hpp b/unit_test/batched/Test_Batched_SerialTrsm_Real.hpp index 4c3b9b42dd..e83c4f0816 100644 --- a/unit_test/batched/Test_Batched_SerialTrsm_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialTrsm_Real.hpp @@ -30,6 +30,27 @@ TEST_F( TestCategory, batched_scalar_serial_trsm_r_u_nt_n_float_float ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } +// +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_n_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_u_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} #endif @@ -64,5 +85,26 @@ TEST_F( TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } +// +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_u_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double ) { + 
typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} #endif diff --git a/unit_test/batched/Test_Batched_TeamSolveLU.hpp b/unit_test/batched/Test_Batched_TeamSolveLU.hpp new file mode 100644 index 0000000000..22ce33721e --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamSolveLU.hpp @@ -0,0 +1,246 @@ +/// \author Vinh Dang (vqdang@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +//#include "KokkosBatched_Vector.hpp" + +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Team_Impl.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Team_Impl.hpp" +#include "KokkosBatched_SolveLU_Decl.hpp" +#include "KokkosBatched_SolveLU_Team_Impl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBatched::Experimental; + +namespace Test { + + template + struct ParamTag { + typedef TA transA; + typedef TB transB; + }; + + template + struct Functor_BatchedTeamGemm { + ViewType _a, _b, _c; + + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedTeamGemm(const ScalarType alpha, + const ViewType &a, + const ViewType &b, + const ScalarType beta, + const ViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const MemberType &member) const { + const int k = member.league_rank(); + + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, 
k, Kokkos::ALL(), Kokkos::ALL()); + + if (member.team_rank() == 0) { + for (int i=0;i(aa.extent(0));++i) + aa(i,i) += 10.0; + } + member.team_barrier(); + + TeamGemm:: + invoke(member, _alpha, aa, bb, _beta, cc); + } + + inline + void run() { + const int league_size = _c.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(policy, *this); + } + }; + + template + struct Functor_BatchedTeamLU { + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedTeamLU(const ViewType &a) + : _a(a) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + const int k = member.league_rank(); + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + + if (member.team_rank() == 0) { + for (int i=0;i(aa.extent(0));++i) + aa(i,i) += 10.0; + } + member.team_barrier(); + + TeamLU::invoke(member, aa); + } + + inline + void run() { + const int league_size = _a.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(policy, *this); + } + }; + + template + struct Functor_TestBatchedTeamSolveLU { + ViewType _a; + ViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamSolveLU(const ViewType &a, const ViewType &b) + : _a(a), _b(b) {} + + template + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + const int k = member.league_rank(); + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamSolveLU::invoke(member, aa, bb); + } + + inline + void run() { + const int league_size = _a.extent(0); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::parallel_for(policy, *this); + } + }; + + template + void impl_test_batched_solvelu(const int N, const int BlkSize) { + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + ViewType a0("a0", N, BlkSize, BlkSize); + ViewType 
a1("a1", N, BlkSize, BlkSize); + ViewType b ("b", N, BlkSize, 5 ); + ViewType x0("x0", N, BlkSize, 5 ); + ViewType a0_T("a0_T", N, BlkSize, BlkSize); + ViewType b_T ("b_T", N, BlkSize, 5 ); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(a0, random, value_type(1.0)); + Kokkos::fill_random(x0, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(a1, a0); + Kokkos::deep_copy(a0_T, a0); + + value_type alpha = 1.0, beta = 0.0; + typedef ParamTag param_tag_type; + + Functor_BatchedTeamGemm(alpha, a0, x0, beta, b).run(); + + Functor_BatchedTeamLU(a1).run(); + + Functor_TestBatchedTeamSolveLU(a1,b).run(); + + Kokkos::fence(); + + //Transpose + typedef ParamTag param_tag_type_T; + + Functor_BatchedTeamGemm(alpha, a0_T, x0, beta, b_T).run(); + + Functor_TestBatchedTeamSolveLU(a1,b_T).run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror x0_host = Kokkos::create_mirror_view(x0); + typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + typename ViewType::HostMirror b_T_host = Kokkos::create_mirror_view(b_T); + + Kokkos::deep_copy(x0_host, x0); + Kokkos::deep_copy(b_host, b); + Kokkos::deep_copy(b_T_host, b_T); + + /// check x0 = b ; this eps is about 10^-14 + typedef typename ats::mag_type mag_type; + mag_type sum(1), diff(0); + const mag_type eps = 1.0e3 * ats::epsilon(); + + for (int k=0;k +int test_batched_solvelu() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_solvelu( 0, 10); + for (int i=0;i<10;++i) { + Test::impl_test_batched_solvelu(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View ViewType; + Test::impl_test_batched_solvelu( 0, 10); + for (int i=0;i<10;++i) { + Test::impl_test_batched_solvelu(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/Test_Batched_TeamSolveLU_Complex.hpp b/unit_test/batched/Test_Batched_TeamSolveLU_Complex.hpp 
new file mode 100644 index 0000000000..f430654aba --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamSolveLU_Complex.hpp @@ -0,0 +1,9 @@ + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_solvelu_dcomplex ) { + //printf("Batched team solveLU - double complex - algorithm type: Unblocked\n"); + test_batched_solvelu,Algo::SolveLU::Unblocked>(); + //printf("Batched team solveLU - double complex - algorithm type: Blocked\n"); + test_batched_solvelu,Algo::SolveLU::Blocked>(); +} +#endif diff --git a/unit_test/batched/Test_Batched_TeamSolveLU_Real.hpp b/unit_test/batched/Test_Batched_TeamSolveLU_Real.hpp new file mode 100644 index 0000000000..7d268feabd --- /dev/null +++ b/unit_test/batched/Test_Batched_TeamSolveLU_Real.hpp @@ -0,0 +1,20 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F( TestCategory, batched_scalar_team_solvelu_float ) { + //printf("Batched team solveLU - float - algorithm type: Unblocked\n"); + test_batched_solvelu(); + //printf("Batched team solveLU - float - algorithm type: Blocked\n"); + test_batched_solvelu(); +} +#endif + + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F( TestCategory, batched_scalar_team_solvelu_double ) { + //printf("Batched team solveLU - double - algorithm type: Unblocked\n"); + test_batched_solvelu(); + //printf("Batched team solveLU - double - algorithm type: Blocked\n"); + test_batched_solvelu(); +} +#endif + diff --git a/unit_test/batched/Test_Batched_TeamTrsm_Complex.hpp b/unit_test/batched/Test_Batched_TeamTrsm_Complex.hpp index 8312980a2a..ebaa846693 100644 --- a/unit_test/batched/Test_Batched_TeamTrsm_Complex.hpp +++ b/unit_test/batched/Test_Batched_TeamTrsm_Complex.hpp @@ -31,7 +31,27 @@ TEST_F( TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_dcomplex ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); } - +// +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_dcomplex ) { + typedef 
::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_dcomplex ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +} TEST_F( TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_double ) { @@ -64,4 +84,25 @@ TEST_F( TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm,double,param_tag_type,algo_tag_type>(); } +// +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + 
test_batched_trsm,double,param_tag_type,algo_tag_type>(); +} #endif diff --git a/unit_test/batched/Test_Batched_TeamTrsm_Real.hpp b/unit_test/batched/Test_Batched_TeamTrsm_Real.hpp index a032785df4..99d33e9a25 100644 --- a/unit_test/batched/Test_Batched_TeamTrsm_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamTrsm_Real.hpp @@ -30,6 +30,27 @@ TEST_F( TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } +// +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_u_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} #endif @@ -64,5 +85,26 @@ TEST_F( TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double ) { typedef Algo::Trsm::Blocked algo_tag_type; test_batched_trsm(); } +// +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_u_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} +TEST_F( TestCategory, 
batched_scalar_team_trsm_l_u_t_n_double_double ) { + typedef ::Test::ParamTag param_tag_type; + typedef Algo::Trsm::Blocked algo_tag_type; + test_batched_trsm(); +} #endif diff --git a/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Complex.cpp b/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..9062a92088 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Real.cpp b/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 0000000000..f4ea1ce387 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Complex.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..15936ff8a3 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Real.cpp b/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..c9afe20df7 --- /dev/null +++ b/unit_test/cuda/Test_Cuda_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Cuda.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Complex.cpp b/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..77f2ba9774 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Complex.cpp @@ 
-0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Real.cpp b/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 0000000000..2b83c07f6b --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Complex.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..24cd27a958 --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Real.cpp b/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..8d6ad6fc2c --- /dev/null +++ b/unit_test/openmp/Test_OpenMP_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_OpenMP.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Complex.cpp b/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Complex.cpp new file mode 100644 index 0000000000..da661bc895 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Real.cpp b/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Real.cpp new file mode 100644 index 0000000000..89f62ee470 --- /dev/null +++ 
b/unit_test/serial/Test_Serial_Batched_SerialSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_SerialSolveLU.hpp" +#include "Test_Batched_SerialSolveLU_Real.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Complex.cpp b/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Complex.cpp new file mode 100644 index 0000000000..56e55cede8 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Complex.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Real.cpp b/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Real.cpp new file mode 100644 index 0000000000..bc30df6355 --- /dev/null +++ b/unit_test/serial/Test_Serial_Batched_TeamSolveLU_Real.cpp @@ -0,0 +1,3 @@ +#include "Test_Serial.hpp" +#include "Test_Batched_TeamSolveLU.hpp" +#include "Test_Batched_TeamSolveLU_Real.hpp" From ec3142ed29a2633d75e802bd31159ad90a63a8ba Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 5 Nov 2018 10:29:28 -0700 Subject: [PATCH 02/47] Fix runtime error with complex double and Intel compiler 18.2.199 --- unit_test/batched/Test_Batched_SerialInverseLU.hpp | 2 +- unit_test/batched/Test_Batched_SerialInverseLU_Complex.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/batched/Test_Batched_SerialInverseLU.hpp b/unit_test/batched/Test_Batched_SerialInverseLU.hpp index 984d71b352..f195349216 100644 --- a/unit_test/batched/Test_Batched_SerialInverseLU.hpp +++ b/unit_test/batched/Test_Batched_SerialInverseLU.hpp @@ -142,7 +142,7 @@ namespace Test { Kokkos::deep_copy(w, value_type(0.0)); Functor_BatchedSerialLU(a1).run(); - + Functor_TestBatchedSerialInverseLU(a1,w).run(); value_type alpha = 1.0, beta = 0.0; diff --git a/unit_test/batched/Test_Batched_SerialInverseLU_Complex.hpp b/unit_test/batched/Test_Batched_SerialInverseLU_Complex.hpp index 
18695abc57..ac3fc14456 100644 --- a/unit_test/batched/Test_Batched_SerialInverseLU_Complex.hpp +++ b/unit_test/batched/Test_Batched_SerialInverseLU_Complex.hpp @@ -4,6 +4,6 @@ TEST_F( TestCategory, batched_scalar_serial_inverselu_dcomplex ) { //printf("Batched serial inverse LU - double complex - algorithm type: Unblocked\n"); test_batched_inverselu,Algo::InverseLU::Unblocked>(); //printf("Batched serial inverse LU - double complex - algorithm type: Blocked\n"); - test_batched_inverselu,Algo::InverseLU::Blocked>(); + test_batched_inverselu,Algo::InverseLU::Blocked>(); } #endif From 8dcd9725cc0dbd1422c92f3a22f57b9f70b8a8cb Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 5 Nov 2018 10:30:11 -0700 Subject: [PATCH 03/47] Fix runtime error with complex double and Intel compiler 18.2.199 --- src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp | 2 +- src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp | 2 +- src/batched/KokkosBatched_Trsm_Serial_Internal.hpp | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp b/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp index 0615e928c9..8a815de407 100644 --- a/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -88,7 +88,7 @@ namespace KokkosBatched { auto B = Kokkos::View >(W.data(), A.extent(0), A.extent(1)); const ScalarType one(1.0); - + #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif diff --git a/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp b/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp index 78fdc2a01c..1a5e18c369 100644 --- a/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp +++ b/src/batched/KokkosBatched_SolveLU_Serial_Impl.hpp @@ -40,7 +40,7 @@ namespace KokkosBatched { SerialTrsm::invoke(one, A, B); //Second, compute X by solving the system U*X = Y for X SerialTrsm::invoke(one, A, B); - + return 0; } diff --git 
a/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp index 00bbe1948e..ec344309ce 100644 --- a/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp @@ -180,8 +180,8 @@ namespace KokkosBatched { const int iend = p, jend = n; const ValueType *__restrict__ a01 = A+p*as1; - /**/ ValueType *__restrict__ b1t = B+p*bs0; - + ValueType *__restrict__ b1t = B+p*bs0; + if (!use_unit_diag) { const ValueType alpha11 = A[p*as0+p*as1]; @@ -191,13 +191,16 @@ namespace KokkosBatched { for (int j=0;j0){ + for (int i=0;i Date: Tue, 6 Nov 2018 11:38:24 -0700 Subject: [PATCH 04/47] Disable Pthreads in Gtest This relates to the issue in Trilinos where serial builds were failing because they tried to load the pthreads library as requested by gtest in KokkosKernels. See: https://github.com/trilinos/Trilinos/issues/3773 --- unit_test/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index fe0b3d5ba0..3155d6d0a6 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -21,6 +21,8 @@ INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_ # the following relative path does not work or users should put kokkoskernels and kokkos # at the same place SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/../kokkos/tpls/gtest) +# Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. 
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGTEST_HAS_PTHREAD=0") INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) From 806bb5dccc1e86afbc768ca994b774392eb86e38 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 9 Nov 2018 01:31:30 -0700 Subject: [PATCH 05/47] Add a note for IF check workaround --- src/batched/KokkosBatched_Trsm_Serial_Internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp index ec344309ce..eb9ca3a3eb 100644 --- a/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Serial_Internal.hpp @@ -192,7 +192,7 @@ namespace KokkosBatched { b1t[j*bs1] = b1t[j*bs1] / alpha11; } - if (p>0){ + if (p>0){//Note: A workaround to produce correct results for complex with Intel-18.2.199 for (int i=0;i Date: Wed, 21 Nov 2018 14:03:38 -0700 Subject: [PATCH 06/47] CMakeLists.txt Fix for case CUBLAS=ON but BLAS not explicitly ON Address issue #347 reported by @csiefer2 --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27ab894f71..046ee71e39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -298,6 +298,10 @@ IF (TPL_ENABLE_MKL) ENDIF() IF(${Kokkos_ENABLE_Cuda}) + IF (KOKKOSKERNELS_ENABLE_TPL_BLAS) + SET(KOKKOSKERNELS_ENABLE_TPL_BLAS ON) + LIST(APPEND TPL_LIST "BLAS") + ENDIF() # CUBLAS is ON by default when CUDA is enabled SET(KOKKOSKERNELS_ENABLE_TPL_CUBLAS ON) # Tribit provides TPL mechanism for CUSPARSE; thus, use it From ade852d157c6d7372e3d64380dc4cce20124e5ad Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 21 Nov 2018 14:09:38 -0700 Subject: [PATCH 07/47] CMakeLists.txt: fix typo in BLAS check --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 046ee71e39..bb8b7757ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -298,7 
+298,7 @@ IF (TPL_ENABLE_MKL) ENDIF() IF(${Kokkos_ENABLE_Cuda}) - IF (KOKKOSKERNELS_ENABLE_TPL_BLAS) + IF (NOT KOKKOSKERNELS_ENABLE_TPL_BLAS) SET(KOKKOSKERNELS_ENABLE_TPL_BLAS ON) LIST(APPEND TPL_LIST "BLAS") ENDIF() From c1e6d13a8d0f038a983564dd0784e1d977d5a536 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 5 Dec 2018 14:19:45 -0700 Subject: [PATCH 08/47] generate_makefile: update to support setting cxxstandard Select cxx standard with --cxxstandard=c++** flag. --- scripts/generate_makefile.bash | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash index 82624cbfdf..ed10038d03 100755 --- a/scripts/generate_makefile.bash +++ b/scripts/generate_makefile.bash @@ -75,6 +75,9 @@ do --cxxflags*) CXXFLAGS="${key#*=}" ;; + --cxxstandard*) + KOKKOS_CXX_STANDARD="${key#*=}" + ;; --ldflags*) LDFLAGS="${key#*=}" ;; @@ -193,6 +196,8 @@ do echo " build. This will still set certain required" echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp," echo " --std=c++11, etc.)." + echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test" + echo " c++11 (default), c++14, c++17, c++1y, c++1z, c++2a" echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" echo " build. 
This will still set certain required" echo " flags via KOKKOS_LDFLAGS (such as -fopenmp," @@ -278,6 +283,10 @@ if [ ${#CXXFLAGS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\"" fi +if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\"" +fi + if [ ${#LDFLAGS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\"" fi From 84e246693b57f04171a48135f21df73a68625f24 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 5 Dec 2018 14:22:34 -0700 Subject: [PATCH 09/47] test_all_sandia: update to support setting cxxstandard --- scripts/test_all_sandia | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 09a4d43de4..1d2917d2c9 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -143,6 +143,9 @@ do --cxxflags-extra*) CXX_FLAGS_EXTRA="${key#*=}" ;; + --cxxstandard*) + CXX_STANDARD="${key#*=}" + ;; --ldflags-extra*) LD_FLAGS_EXTRA="${key#*=}" ;; @@ -643,6 +646,8 @@ single_build_and_test() { local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" + local cxx_standard="${CXX_STANDARD}" + if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" fi @@ -662,7 +667,7 @@ single_build_and_test() { run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } fi else - run_cmd ${KOKKOSKERNELS_PATH}/scripts/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} --with-tpls=${KOKKOSKERNELS_ENABLE_TPLS} &>> 
${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/scripts/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} --with-tpls=${KOKKOSKERNELS_ENABLE_TPLS} &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } local -i build_end_time=$(date +%s) From 22f9d0ee9366f2cd3a006cbcba7166357956a74f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 5 Dec 2018 16:17:53 -0700 Subject: [PATCH 10/47] src/Makefile, generate_makefile: Fixes for make clean Updated so that calling 'make clean' works without errors. 
--- scripts/generate_makefile.bash | 7 ++++++- src/Makefile | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash index 82624cbfdf..2639f5d0a5 100755 --- a/scripts/generate_makefile.bash +++ b/scripts/generate_makefile.bash @@ -357,9 +357,14 @@ else fi mkdir -p install -echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/Makefile.kokkos +echo "#Makefile to satisfy existence of target kokkos-clean before installing the library" > install/Makefile.kokkos echo "kokkos-clean:" >> install/Makefile.kokkos echo "" >> install/Makefile.kokkos +echo "#Makefile to satisfy existence of target kokkos-clean and kokkoskernels-clean before installing the library" > install/Makefile.kokkos-kernels +echo "kokkoskernels-clean:" >> install/Makefile.kokkos-kernels +echo "" >> install/Makefile.kokkos-kernels +echo "kokkos-clean:" >> install/Makefile.kokkos-kernels +echo "" >> install/Makefile.kokkos-kernels mkdir -p kokkos mkdir -p src mkdir -p unit_test diff --git a/src/Makefile b/src/Makefile index 8a0cd63e13..de1d3d9cfc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -93,3 +93,7 @@ build: kokkoskernels-build-lib install: kokkoskernels-install +clean: kokkoskernels-clean kokkos-clean + rm -f *.o +# rm -f *.o $(KOKKOSKERNELS_INTERNAL_LIBRARY) KokkosKernels_config.h KokkosKernels_config.tmp KokkosCore_config.h KokkosCore_config.tmp + From b8b1b0db9c86e2fca988094ae905fece034c3559 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 10 Dec 2018 10:02:27 -0700 Subject: [PATCH 11/47] test_all_sandia: update for white Change xl to 16.1.1 Add actual spot-check list --- scripts/test_all_sandia | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 1d2917d2c9..244addf8a6 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -243,7 +243,7 @@ elif [ 
"$MACHINE" = "white" ]; then BASE_MODULE_LIST="/" IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.0" + CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.1" module load netlib/3.8.0/gcc export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a" @@ -251,12 +251,21 @@ elif [ "$MACHINE" = "white" ]; then # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST "Serial" xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=Power8,Kepler37" From e85046861a23a7f292c923f810bc5b0f4b609de7 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 10 Dec 2018 11:00:50 -0700 Subject: [PATCH 12/47] test_all_sandia: update xl compilers Add xl 16.1.0 and 16.1.1 for testing --- scripts/test_all_sandia | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 244addf8a6..fc8ebb8a73 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -243,7 +243,7 @@ elif [ "$MACHINE" = "white" ]; then BASE_MODULE_LIST="/" IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.0" module load netlib/3.8.0/gcc export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a" @@ -263,6 +263,7 @@ elif [ "$MACHINE" = "white" ]; then COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi From bb933187968cb58e350c4bd1017374e31f7db0d3 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Mon, 10 Dec 2018 19:36:30 -0700 Subject: [PATCH 13/47] C++14 Workaround issue #349 --- .../KokkosBatched_Gemm_Team_Internal.hpp | 3 +- .../KokkosBatched_Gemv_Team_Internal.hpp | 3 +- .../KokkosBatched_LU_Team_Internal.hpp | 14 +++-- .../KokkosBatched_Trsm_Team_Internal.hpp | 26 +++++--- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 63 +++++++++++++++++++ 5 files changed, 93 insertions(+), 16 deletions(-) diff --git a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp index 46d3c757ff..f566610d1e 100644 --- a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp @@ -120,7 +120,8 @@ namespace KokkosBatched { const ValueType *__restrict__ AA, const ValueType *__restrict__ BB, /**/ ValueType *__restrict__ CC) { - const int + // Made this non-const in order to WORKAROUND issue #349 + int mb = mbAlgo, mp = (ib%mb), mq = (ib/mb) + (mp>0), nb = nbAlgo, np = (jb%nb), nq = (jb/nb) + (np>0); diff --git 
a/src/batched/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/KokkosBatched_Gemv_Team_Internal.hpp index ab15b46ae1..0a8673a020 100644 --- a/src/batched/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemv_Team_Internal.hpp @@ -112,7 +112,8 @@ namespace KokkosBatched { InnerMultipleDotProduct inner(as0, as1, xs0, ys0); const int tsize = member.team_size(); const int mb_a = m/tsize + (m%tsize>0), mb_b = mbAlgo; - const int mb = mb_a < mb_b ? mb_a : mb_b, mp = m%mb; + // Made this non-const in order to WORKAROUND issue #349 + int mb = mb_a < mb_b ? mb_a : mb_b, mp = m%mb; Kokkos::parallel_for (Kokkos::TeamThreadRange(member, (m/mb) + (mp>0)), diff --git a/src/batched/KokkosBatched_LU_Team_Internal.hpp b/src/batched/KokkosBatched_LU_Team_Internal.hpp index a2a2eae933..5fc23cd3ff 100644 --- a/src/batched/KokkosBatched_LU_Team_Internal.hpp +++ b/src/batched/KokkosBatched_LU_Team_Internal.hpp @@ -48,7 +48,9 @@ namespace KokkosBatched { const auto minus_abs_tiny = -abs_tiny; for (int p=0;p 0? - ((jb-mb) + (ib-mb))/tsize + (((jb-mb) + (ib-mb))%tsize > 0): + // Made this non-const in order to WORKAROUND issue #349 + int mb = mbAlgo; + int nb = ((jb-mb) + (ib-mb)) > 0? + ((jb-mb) + (ib-mb))/tsize + (((jb-mb) + (ib-mb))%tsize > 0): 1; const int kb = ib < jb ? 
ib : jb; @@ -129,7 +132,8 @@ namespace KokkosBatched { lu.serial_invoke(pb, Ap); member.team_barrier(); - const int + // Made this non-const in order to WORKAROUND issue #349 + int m_abr = ib-p-mb, n_abr = jb-p-mb, mp_abr = m_abr%nb, np_abr = n_abr%nb, mq_abr = (m_abr/nb)+(mp_abr>0), nq_abr = (n_abr/nb)+(np_abr>0); diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index ab8dceb1e4..1bac956e0a 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -56,7 +56,9 @@ namespace KokkosBatched { if (m <= 0 || n <= 0) return 0; for (int p=0;p 0); - const int np = jb%nb; + // Made this non-const in order to WORKAROUND issue #349 + int nb = (jb/tsize + jb%tsize > 0); + int np = jb%nb; for (int p=0;p ib ? (ib-p) : mb); + // Made this non-const in order to WORKAROUND issue #349 + int pb = ((p+mb) > ib ? (ib-p) : mb); // trsm update const ValueType *__restrict__ Ap = AA+p*as0+p*as1; /**/ ValueType *__restrict__ Bp = BB+p*bs0; member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,(jb/nb)+(np>0)),[&](const int &jj) { - const int j = jj*nb, qb = (j+nb) > jb ? np : nb; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,(jb/nb)+(np>0)),[&](const int jj) { + // Made this non-const in order to WORKAROUND issue #349 + int j = jj*nb, qb = (j+nb) > jb ? np : nb; if (use_unit_diag) trsm_u.serial_invoke(Ap, pb, qb, Bp+j*bs1); else trsm_n.serial_invoke(Ap, pb, qb, Bp+j*bs1); }); @@ -202,7 +207,9 @@ namespace KokkosBatched { ValueType *__restrict__ B0 = B; for (int p=(m-1);p>=0;--p) { - const int iend = p, jend = n; + // Made this non-const in order to WORKAROUND issue #349 + int iend = p; + int jend = n; const ValueType *__restrict__ a01 = A+p*as1; /**/ ValueType *__restrict__ b1t = B+p*bs0; @@ -266,8 +273,9 @@ namespace KokkosBatched { /**/ ValueType *__restrict__ BB) { const int mb = mbAlgo; //(ib <=5 ? 
ib : mbAlgo); const int tsize = member.team_size(); - const int nb = (jb/tsize + jb%tsize > 0); - const int np = jb%nb; + // Made this non-const in order to WORKAROUND issue #349 + int nb = (jb/tsize + jb%tsize > 0); + int np = jb%nb; for (int pp=0;pp +#ifdef KOKKOS_ENABLE_CXX14 +#ifdef KOKKOS_COMPILER_GNU +#if KOKKOS_COMPILER_GNU<=720 +#define KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND +#endif +#endif +#endif + namespace KokkosBlas { namespace Impl { @@ -77,16 +85,26 @@ struct impl_deep_copy_matrix_block Date: Tue, 11 Dec 2018 00:42:47 -0700 Subject: [PATCH 14/47] Fix equivalent view check --- src/blas/impl/KokkosBlas1_abs_impl.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_impl.hpp b/src/blas/impl/KokkosBlas1_abs_impl.hpp index ef26674e0a..7dda36fc0a 100644 --- a/src/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/src/blas/impl/KokkosBlas1_abs_impl.hpp @@ -197,7 +197,9 @@ MV_Abs_Generic (const RMV& R, const XMV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if (R == X) { // if R and X are the same (alias one another) + if(std::is_same::value && + std::is_same::value && + ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); } @@ -225,7 +227,9 @@ V_Abs_Generic (const RV& R, const XV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if (R == X) { // if R and X are the same (alias one another) + if(std::is_same::value && + std::is_same::value && + ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) V_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); } From b088ea8a92ec7627a32af492f8abac0b5f8b3891 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 11 Dec 2018 02:46:02 -0700 Subject: [PATCH 15/47] Fix equivalent view check --- src/blas/impl/KokkosBlas1_abs_impl.hpp | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_impl.hpp b/src/blas/impl/KokkosBlas1_abs_impl.hpp index 7dda36fc0a..2da5e976e7 100644 --- a/src/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/src/blas/impl/KokkosBlas1_abs_impl.hpp @@ -197,8 +197,8 @@ MV_Abs_Generic (const RMV& R, const XMV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if(std::is_same::value && - std::is_same::value && + if(std::is_same::value && + std::is_same::value && ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); @@ -227,8 +227,8 @@ V_Abs_Generic (const RV& R, const XV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if(std::is_same::value && - std::is_same::value && + if(std::is_same::value && + std::is_same::value && ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) V_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); From ec26b326a1ce0436117816abd57339f61972219a Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 11:03:31 -0700 Subject: [PATCH 16/47] KokkosBlas - add profile regions in tpl --- .../tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 16 ++++++++++++++++ src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 16 ++++++++++++++++ src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 16 ++++++++++++++++ src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 16 ++++++++++++++++ .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 16 ++++++++++++++++ src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 16 ++++++++++++++++ 6 files changed, 96 insertions(+) diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index b48709969f..fea7103ea9 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -90,6 +90,7 @@ struct 
Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ if((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ axpby_print_specialization(); \ int N = X.extent(0); \ @@ -97,6 +98,7 @@ struct Axpby< \ daxpy_(&N,&alpha,X.data(),&one,Y.data(),&one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -120,6 +122,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ if((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ axpby_print_specialization(); \ int N = X.extent(0); \ @@ -127,6 +130,7 @@ struct Axpby< \ saxpy_(&N,&alpha,X.data(),&one,Y.data(),&one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -149,6 +153,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ if((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ axpby_print_specialization(); \ int N = X.extent(0); \ @@ -158,6 +163,7 @@ struct Axpby< \ reinterpret_cast* >(Y.data()),&one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -180,6 +186,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,complex]"); \ if((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ axpby_print_specialization(); \ int N = X.extent(0); \ @@ -189,6 +196,7 @@ struct Axpby< \ reinterpret_cast* >(Y.data()),&one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -240,6 +248,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if((numElems < static_cast (INT_MAX)) && (beta == 1.0)) { \ axpby_print_specialization(); \ @@ -249,6 +258,7 @@ struct Axpby< \ cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -273,6 +283,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ axpby_print_specialization(); \ @@ -282,6 +293,7 @@ struct Axpby< \ cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -305,6 +317,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ axpby_print_specialization(); \ @@ -314,6 +327,7 @@ struct Axpby< \ cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -337,6 +351,7 @@ struct Axpby< \ \ static void \ axpby (const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ if((numElems < static_cast (INT_MAX)) && (beta == 1.0f)) { \ axpby_print_specialization(); \ @@ -346,6 +361,7 @@ struct Axpby< \ cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one); \ } else \ 
Axpby::axpby(alpha,X,beta,Y); \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 3d2bbc3c56..a5ac866517 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -88,6 +88,7 @@ Kokkos::View, \ \ static void dot (RV& R, const XV& X, const XV& Y) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -97,6 +98,7 @@ Kokkos::View, \ } else { \ Dot::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -119,6 +121,7 @@ Kokkos::View, \ \ static void dot (RV& R, const XV& X, const XV& Y) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -128,6 +131,7 @@ Kokkos::View, \ } else { \ Dot::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -150,6 +154,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -162,6 +167,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -184,6 +190,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -196,6 +203,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -242,6 +250,7 @@ Kokkos::View, \ \ static void dot (RV& R, const XV& X, const XV& Y) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -252,6 
+261,7 @@ Kokkos::View, \ } else { \ Dot::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -274,6 +284,7 @@ Kokkos::View, \ \ static void dot (RV& R, const XV& X, const XV& Y) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -284,6 +295,7 @@ Kokkos::View, \ } else { \ Dot::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -306,6 +318,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -316,6 +329,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -338,6 +352,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ dot_print_specialization(); \ @@ -348,6 +363,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::dot(R,X,Y); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index c791454dcb..e185ca91b7 100644 --- a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -82,6 +82,7 @@ Kokkos::View, \ \ static void nrm1 (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -91,6 +92,7 @@ Kokkos::View, \ } else { \ Nrm1::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -111,6 +113,7 @@ Kokkos::View, \ \ static void nrm1 (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ 
-120,6 +123,7 @@ Kokkos::View, \ } else { \ Nrm1::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -140,6 +144,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -149,6 +154,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -169,6 +175,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -178,6 +185,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -222,6 +230,7 @@ Kokkos::View, \ \ static void nrm1 (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -232,6 +241,7 @@ Kokkos::View, \ } else { \ Nrm1::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -252,6 +262,7 @@ Kokkos::View, \ \ static void nrm1 (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -262,6 +273,7 @@ Kokkos::View, \ } else { \ Nrm1::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -282,6 +294,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -292,6 +305,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -312,6 +326,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm1_print_specialization(); \ @@ -322,6 +337,7 @@ Kokkos::View*, LAYOUT, 
Kokkos::Device::nrm1(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 2a4facce96..640adb0c49 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -82,6 +82,7 @@ Kokkos::View, \ \ static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -92,6 +93,7 @@ Kokkos::View, \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -112,6 +114,7 @@ Kokkos::View, \ \ static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -122,6 +125,7 @@ Kokkos::View, \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -142,6 +146,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -152,6 +157,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -172,6 +178,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -182,6 +189,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -226,6 +234,7 @@ Kokkos::View, \ \ static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if 
(numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -237,6 +246,7 @@ Kokkos::View, \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -257,6 +267,7 @@ Kokkos::View, \ \ static void nrm2 (RV& R, const XV& X, const bool& take_sqrt) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -268,6 +279,7 @@ Kokkos::View, \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -288,6 +300,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -299,6 +312,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -319,6 +333,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast (INT_MAX)) { \ nrm2_print_specialization(); \ @@ -330,6 +345,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrm2(R,X,take_sqrt); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 8ee9d5f19f..e0661fee21 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -82,6 +82,7 @@ Kokkos::View, \ \ static void nrminf (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { R() = 0.0; return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -93,6 +94,7 @@ Kokkos::View, \ } else { \ NrmInf::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -113,6 +115,7 @@ Kokkos::View, \ \ static void nrminf (RV& R, const XV& X) \ { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { R() = 0.0f; return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -124,6 +127,7 @@ Kokkos::View, \ } else { \ NrmInf::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -145,6 +149,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { R() = 0.0; return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -156,6 +161,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -177,6 +183,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { R() = 0.0f; return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -188,6 +195,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -232,6 +240,7 @@ Kokkos::View, \ \ static void nrminf (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -245,6 +254,7 @@ Kokkos::View, \ } else { \ NrmInf::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -265,6 +275,7 @@ Kokkos::View, \ \ static void nrminf (RV& R, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { Kokkos::deep_copy (R, 0.0f);; return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -278,6 +289,7 @@ Kokkos::View, \ } else { \ NrmInf::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -299,6 +311,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ if (numElems < static_cast (INT_MAX)) { \ 
@@ -315,6 +328,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -336,6 +350,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { Kokkos::deep_copy (R, 0.0f); return; } \ if (numElems < static_cast (INT_MAX)) { \ @@ -352,6 +367,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::nrminf(R,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 1430744702..cb24ace50c 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -88,6 +88,7 @@ Kokkos::View, \ \ static void scal (const RV& R, const double& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -97,6 +98,7 @@ Kokkos::View, \ } else { \ Scal::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -119,6 +121,7 @@ Kokkos::View, \ \ static void scal (const RV& R, const float& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -128,6 +131,7 @@ Kokkos::View, \ } else { \ Scal::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -150,6 +154,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -159,6 +164,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::scal(R,alpha,X); \ } \ + 
Kokkos::Profiling::popRegion(); \ } \ }; @@ -181,6 +187,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -190,6 +197,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -236,6 +244,7 @@ Kokkos::View, \ \ static void scal (const RV& R, const double& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -246,6 +255,7 @@ Kokkos::View, \ } else { \ Scal::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -268,6 +278,7 @@ Kokkos::View, \ \ static void scal (const RV& R, const float& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -278,6 +289,7 @@ Kokkos::View, \ } else { \ Scal::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -300,6 +312,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -310,6 +323,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -332,6 +346,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device& alpha, const XV& X) \ { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ if ((numElems 
< static_cast (INT_MAX)) && (R.data() == X.data())) { \ scal_print_specialization(); \ @@ -342,6 +357,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device::scal(R,alpha,X); \ } \ + Kokkos::Profiling::popRegion(); \ } \ }; From a5cfab0216809dfe77b369a55f8836a2d862a109 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 11:29:35 -0700 Subject: [PATCH 17/47] KokkosBlas - add regions to src directory --- src/blas/impl/KokkosBlas1_abs_spec.hpp | 8 +++++--- src/blas/impl/KokkosBlas1_axpby_spec.hpp | 8 ++++++-- src/blas/impl/KokkosBlas1_dot_spec.hpp | 4 ++++ src/blas/impl/KokkosBlas1_mult_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_nrm1_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_nrm2_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_nrm2w_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_nrminf_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_reciprocal_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas1_scal_spec.hpp | 9 ++++++--- src/blas/impl/KokkosBlas1_sum_spec.hpp | 5 ++++- src/blas/impl/KokkosBlas1_update_spec.hpp | 6 ++++-- src/blas/impl/KokkosBlas2_gemv_spec.hpp | 3 ++- 13 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_spec.hpp b/src/blas/impl/KokkosBlas1_abs_spec.hpp index 9c3ea8e82a..e973aae492 100644 --- a/src/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/src/blas/impl/KokkosBlas1_abs_spec.hpp @@ -127,7 +127,7 @@ struct Abs "RMV is not rank 1."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Abs<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -145,9 +145,10 @@ struct Abs typedef std::int64_t index_type; V_Abs_Generic (R, X); } + Kokkos::Profiling::popRegion(); } }; - + template struct Abs { typedef typename XMV::size_type size_type; @@ -162,7 +163,7 
@@ struct Abs { "RMV is not rank 2."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Abs<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -182,6 +183,7 @@ struct Abs { typedef std::int64_t index_type; MV_Abs_Generic (R, X); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_axpby_spec.hpp b/src/blas/impl/KokkosBlas1_axpby_spec.hpp index 50ee490cca..f955d92fe3 100644 --- a/src/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/src/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -179,7 +179,7 @@ struct Axpby "X and Y must have the same rank."); static_assert (YMV::Rank == 2, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n",typeid(AV).name(),typeid(XMV).name(),typeid(BV).name(),typeid(YMV).name()); @@ -213,6 +213,7 @@ struct Axpby Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; Axpby_MV_Invoke_Layout::run(av, X, bv, Y, a, b); } + Kokkos::Profiling::popRegion(); } }; @@ -247,7 +248,7 @@ struct Axpby::axpby: " "X and Y must have rank 1."); + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n",typeid(AV).name(),typeid(XV).name(),typeid(BV).name(),typeid(YV).name()); @@ -386,6 +389,7 @@ struct Axpby (alpha, X, beta, Y, 0, a, b); } + Kokkos::Profiling::popRegion(); } }; #endif 
//!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY diff --git a/src/blas/impl/KokkosBlas1_dot_spec.hpp b/src/blas/impl/KokkosBlas1_dot_spec.hpp index 3b23c31a03..1c5322f8c3 100644 --- a/src/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/src/blas/impl/KokkosBlas1_dot_spec.hpp @@ -177,6 +177,7 @@ struct Dot "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n",typeid(XV).name(),typeid(YV).name()); @@ -196,6 +197,7 @@ struct Dot DotFunctor f(X,Y); f.run("KokkosBlas::dot<1D>",R); } + Kokkos::Profiling::popRegion(); } }; @@ -212,6 +214,7 @@ struct Dot: " "RV is not rank 1."); + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n",typeid(RV).name(),typeid(XV).name(),typeid(YV).name()); @@ -231,6 +234,7 @@ struct Dot::dot(R,X,Y); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_mult_spec.hpp b/src/blas/impl/KokkosBlas1_mult_spec.hpp index 7e416683e5..3cbc52397d 100644 --- a/src/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/src/blas/impl/KokkosBlas1_mult_spec.hpp @@ -167,7 +167,7 @@ struct Mult "X, and Y must have the rank 2."); static_assert (AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "AV must have rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -187,6 +187,7 @@ struct Mult else { MV_Mult_Generic (gamma, Y, alpha, A, X); } + Kokkos::Profiling::popRegion(); } }; @@ -218,7 
+219,7 @@ struct Mult static_assert ((int) XV::rank == (int) YV::rank && (int) AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "X, Y, and Z must have rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n",typeid(YV).name(),typeid(AV).name(),typeid(XV).name()); @@ -234,6 +235,7 @@ struct Mult else { V_Mult_Generic (gamma, Y, alpha, A, X); } + Kokkos::Profiling::popRegion(); } }; #endif //!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY diff --git a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp index 5f7eafffc3..095ace371e 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -131,7 +131,7 @@ struct Nrm1 "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm1<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -148,6 +148,7 @@ struct Nrm1 typedef std::int64_t index_type; V_Nrm1_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; @@ -174,7 +175,7 @@ struct Nrm1 { printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); } #endif - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); if (numRows < static_cast (INT_MAX) && @@ -185,6 +186,7 @@ struct Nrm1 { typedef std::int64_t index_type; MV_Nrm1_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git 
a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp index 910fa12ae5..254b9ca047 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -131,7 +131,7 @@ struct Nrm2 "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm2<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -148,6 +148,7 @@ struct Nrm2 typedef std::int64_t index_type; V_Nrm2_Invoke (R, X, take_sqrt); } + Kokkos::Profiling::popRegion(); } }; @@ -166,7 +167,7 @@ struct Nrm2 { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Nrm2<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); @@ -185,6 +186,7 @@ struct Nrm2 { typedef std::int64_t index_type; MV_Nrm2_Invoke (R, X, take_sqrt); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp index ef0a8a4855..cd52c518b3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -127,7 +127,7 @@ struct Nrm2w "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm2w<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s 
>\n",typeid(RMV).name(),typeid(XMV).name()); @@ -144,6 +144,7 @@ struct Nrm2w typedef std::int64_t index_type; V_Nrm2w_Invoke (R, X, W, take_sqrt); } + Kokkos::Profiling::popRegion(); } }; @@ -162,7 +163,7 @@ struct Nrm2w { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Nrm2w<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); @@ -181,6 +182,7 @@ struct Nrm2w { typedef std::int64_t index_type; MV_Nrm2w_Invoke (R, X, W, take_sqrt); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_nrminf_spec.hpp b/src/blas/impl/KokkosBlas1_nrminf_spec.hpp index 40379d968e..458666594f 100644 --- a/src/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -131,7 +131,7 @@ struct NrmInf "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::NrmInf<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -148,6 +148,7 @@ struct NrmInf typedef std::int64_t index_type; V_NrmInf_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; @@ -166,7 +167,7 @@ struct NrmInf { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::NrmInf<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s 
>\n",typeid(RV).name(),typeid(XMV).name()); @@ -185,6 +186,7 @@ struct NrmInf { typedef std::int64_t index_type; MV_NrmInf_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 03b0cd2fff..3eee2fc174 100644 --- a/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -127,7 +127,7 @@ struct Reciprocal "RMV is not rank 1."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Reciprocal<1-D>: " "XMV is not rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -145,6 +145,7 @@ struct Reciprocal typedef std::int64_t index_type; V_Reciprocal_Generic (R, X); } + Kokkos::Profiling::popRegion(); } }; @@ -162,7 +163,7 @@ struct Reciprocal { "RMV is not rank 2."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Reciprocal<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -182,6 +183,7 @@ struct Reciprocal { typedef std::int64_t index_type; MV_Reciprocal_Generic (R, X); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_scal_spec.hpp b/src/blas/impl/KokkosBlas1_scal_spec.hpp index f9f891b4bb..936d15dbce 100644 --- a/src/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/src/blas/impl/KokkosBlas1_scal_spec.hpp @@ -144,7 +144,7 @@ struct Scal: " "XV is not rank 1."); - + 
Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -173,6 +173,7 @@ struct Scal (R, alpha, X, a); } + Kokkos::Profiling::popRegion(); } }; @@ -202,7 +203,7 @@ struct Scal { "AV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Scal<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s >\n",typeid(RMV).name(),typeid(AV).name(),typeid(XMV).name()); @@ -222,6 +223,7 @@ struct Scal { typedef typename XMV::size_type index_type; MV_Scal_Invoke_Left (R, av, X, a); } + Kokkos::Profiling::popRegion(); } }; @@ -248,7 +250,7 @@ struct Scal: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -281,6 +283,7 @@ struct Scal (R, alpha, X, a); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_sum_spec.hpp b/src/blas/impl/KokkosBlas1_sum_spec.hpp index 0e4fbcc1d4..7476fae98f 100644 --- a/src/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/src/blas/impl/KokkosBlas1_sum_spec.hpp @@ -131,6 +131,7 @@ struct Sum "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Sum<1-D>: " "XMV is not rank 1."); + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -148,6 +149,7 @@ struct Sum typedef std::int64_t index_type; V_Sum_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; @@ -166,7 +168,7 @@ struct Sum { "RV is not rank 1."); static_assert (XMV::rank == 2, 
"KokkosBlas::Impl::Sum<2-D>: " "XMV is not rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); @@ -185,6 +187,7 @@ struct Sum { typedef std::int64_t index_type; MV_Sum_Invoke (R, X); } + Kokkos::Profiling::popRegion(); } }; #endif diff --git a/src/blas/impl/KokkosBlas1_update_spec.hpp b/src/blas/impl/KokkosBlas1_update_spec.hpp index d98d453c91..872b99aa32 100644 --- a/src/blas/impl/KokkosBlas1_update_spec.hpp +++ b/src/blas/impl/KokkosBlas1_update_spec.hpp @@ -162,7 +162,7 @@ struct Update "X, Y, and Z must have the same rank."); static_assert (ZMV::rank == 2, "KokkosBlas::Impl::Update::update: " "XMV, YMV, and ZMV must have rank 2."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -221,6 +221,7 @@ struct Update MV_Update_Generic (alpha, X, beta, Y, gamma, Z, a, b, c); } } + Kokkos::Profiling::popRegion(); } }; @@ -256,7 +257,7 @@ struct Update "X, Y, and Z must have the same rank."); static_assert (ZV::rank == 1, "KokkosBlas::Impl::Update::update: " "XV, YV, and ZV must have rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::update<> ETI specialization for < %s , %s , %s >\n",typeid(XV).name(),typeid(YV).name(),typeid(ZV).name()); @@ -297,6 +298,7 @@ struct Update typedef typename XV::size_type index_type; V_Update_Generic (alpha, X, beta, Y, gamma, Z, a, b, c); } + Kokkos::Profiling::popRegion(); } }; #endif //!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY diff --git 
a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp index a49e83bdcd..b7fe8e3456 100644 --- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -127,7 +127,7 @@ struct GEMV { "XViewType must have rank 1."); static_assert (static_cast (YViewType::rank) == 1, "YViewType must have rank 1."); - + Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::gemv[ETI]":"KokkosBlas::gemv[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); @@ -142,6 +142,7 @@ struct GEMV { singleLevelGemv (trans, alpha, A, x, beta, y); } + Kokkos::Profiling::popRegion(); } #else ; From d4937765b4b424a1b067b2d9a00c3e7748b3212f Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 11:32:01 -0700 Subject: [PATCH 18/47] KokkosBlas - add profiling to fill --- src/blas/KokkosBlas1_fill.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/blas/KokkosBlas1_fill.hpp b/src/blas/KokkosBlas1_fill.hpp index 381a274f5f..ddf7088a24 100644 --- a/src/blas/KokkosBlas1_fill.hpp +++ b/src/blas/KokkosBlas1_fill.hpp @@ -56,7 +56,9 @@ namespace KokkosBlas { /// \param val [in] Value with which to fill the entries of X. 
template void fill (const XMV& X, const typename XMV::non_const_value_type& val) { + Kokkos::Profiling::pushRegion("KokkosBlas::fill"); Kokkos::deep_copy(X,val); + Kokkos::Profiling::popRegion(); } } From 93bdd20eef6c9d0314198a58ccaeb5308184c6fb Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 12:16:55 -0700 Subject: [PATCH 19/47] KokkosBlas - add testing regions and parallel for decorations for team test --- unit_test/blas/Test_Blas1_abs.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_asum.hpp | 8 ++++++++ unit_test/blas/Test_Blas1_axpby.hpp | 19 +++++++++++++++++++ unit_test/blas/Test_Blas1_axpy.hpp | 20 ++++++++++++++++++++ unit_test/blas/Test_Blas1_dot.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_mult.hpp | 22 +++++++++++++++++++++- unit_test/blas/Test_Blas1_nrm1.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_nrm2.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_nrm2_squared.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_nrminf.hpp | 19 +++++++++++++++++-- unit_test/blas/Test_Blas1_reciprocal.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_scal.hpp | 22 +++++++++++++++++++++- unit_test/blas/Test_Blas1_sum.hpp | 16 ++++++++++++++++ unit_test/blas/Test_Blas1_team_abs.hpp | 8 ++++---- unit_test/blas/Test_Blas1_team_axpby.hpp | 8 ++++---- unit_test/blas/Test_Blas1_team_axpy.hpp | 8 ++++---- unit_test/blas/Test_Blas1_team_dot.hpp | 16 ++++++++-------- unit_test/blas/Test_Blas1_team_mult.hpp | 10 +++++----- unit_test/blas/Test_Blas1_team_nrm2.hpp | 4 ++-- unit_test/blas/Test_Blas1_team_scal.hpp | 12 ++++++------ unit_test/blas/Test_Blas1_team_update.hpp | 10 +++++----- unit_test/blas/Test_Blas1_update.hpp | 21 ++++++++++++++++++++- unit_test/blas/Test_Blas2_gemv.hpp | 10 ++++++++++ unit_test/blas/Test_Blas2_team_gemv.hpp | 6 +++--- 24 files changed, 289 insertions(+), 46 deletions(-) diff --git a/unit_test/blas/Test_Blas1_abs.hpp b/unit_test/blas/Test_Blas1_abs.hpp index dbc767f94b..aca3936cd3 100644 --- 
a/unit_test/blas/Test_Blas1_abs.hpp +++ b/unit_test/blas/Test_Blas1_abs.hpp @@ -222,37 +222,53 @@ int test_abs_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, abs_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_float"); test_abs (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, abs_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_float"); test_abs_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, abs_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, abs_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); test_abs_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, abs_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); test_abs,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, abs_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); test_abs_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, abs_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_int"); test_abs (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, abs_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_int"); test_abs_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git 
a/unit_test/blas/Test_Blas1_asum.hpp b/unit_test/blas/Test_Blas1_asum.hpp index 8632ca29bf..bfdfa888f1 100644 --- a/unit_test/blas/Test_Blas1_asum.hpp +++ b/unit_test/blas/Test_Blas1_asum.hpp @@ -82,25 +82,33 @@ int test_asum() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, asum_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_float"); test_asum (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, asum_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_double"); test_asum (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, asum_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_complex_double"); test_asum,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, asum_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_int"); test_asum (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_axpby.hpp b/unit_test/blas/Test_Blas1_axpby.hpp index 9ee10cbc7c..f2bc692d09 100644 --- a/unit_test/blas/Test_Blas1_axpby.hpp +++ b/unit_test/blas/Test_Blas1_axpby.hpp @@ -218,45 +218,64 @@ int test_axpby_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpby_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_float"); test_axpby (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpby_mv_float ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_float"); test_axpby_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpby_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double"); test_axpby (); } TEST_F( TestCategory, axpby_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double"); test_axpby_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpby_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_complex_double"); test_axpby,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpby_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_complex_double"); test_axpby_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpby_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_int"); test_axpby (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpby_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_int"); test_axpby_mv (); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, axpby_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double_int"); test_axpby (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpby_double_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double_int"); test_axpby_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git 
a/unit_test/blas/Test_Blas1_axpy.hpp b/unit_test/blas/Test_Blas1_axpy.hpp index 2d8f9c1e8d..bbe586085d 100644 --- a/unit_test/blas/Test_Blas1_axpy.hpp +++ b/unit_test/blas/Test_Blas1_axpy.hpp @@ -213,45 +213,65 @@ int test_axpy_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpy_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_float"); test_axpy (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpy_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_float"); test_axpy_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpy_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double"); test_axpy (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpy_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double"); test_axpy_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpy_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_complex_double"); test_axpy,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpy_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_complex_double"); test_axpy_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, axpy_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_int"); test_axpy (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpy_mv_int ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_int"); test_axpy_mv (); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, axpy_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double_int"); test_axpy (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, axpy_double_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double_int"); test_axpy_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 755e0e8f49..4875020942 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -217,37 +217,53 @@ int test_dot_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, dot_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_float"); test_dot (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, dot_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_float"); test_dot_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, dot_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_double"); test_dot (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, dot_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_double"); test_dot_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, dot_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_complex_double"); test_dot,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); 
} TEST_F( TestCategory, dot_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_complex_double"); test_dot_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, dot_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_int"); test_dot (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, dot_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_int"); test_dot_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_mult.hpp b/unit_test/blas/Test_Blas1_mult.hpp index 48a356bbd5..4e760cd29f 100644 --- a/unit_test/blas/Test_Blas1_mult.hpp +++ b/unit_test/blas/Test_Blas1_mult.hpp @@ -249,45 +249,65 @@ int test_mult_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, mult_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); test_mult (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, mult_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); test_mult_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, mult_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double"); test_mult (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, mult_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double"); test_mult_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, mult_complex_double ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_complex_double"); test_mult,Kokkos::complex,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, mult_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_complex_double"); test_mult_mv,Kokkos::complex,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, mult_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_int"); test_mult (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, mult_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_int"); test_mult_mv (); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, mult_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double_int"); test_mult (); + Kokkos::Profiling::popRegion(); } -TEST_F( TestCategory, mult_double_mv_int ) { +TEST_F( TestCategory, mult_mv_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double_int"); test_mult_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index bd9d3e6fc7..e4ea9df1a7 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -165,37 +165,53 @@ int test_nrm1_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm1_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_float"); test_nrm1 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm1_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_float"); test_nrm1_mv (); + Kokkos::Profiling::popRegion(); } #endif 
#if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm1_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_double"); test_nrm1 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm1_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_double"); test_nrm1_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm1_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_complex_double"); test_nrm1,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm1_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_complex_double"); test_nrm1_mv,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm1_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_int"); test_nrm1 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm1_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_int"); test_nrm1_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 3d2233d051..699c1aa806 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -167,37 +167,53 @@ int test_nrm2_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_float"); test_nrm2 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_mv_float ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_float"); test_nrm2_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_double"); test_nrm2 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_double"); test_nrm2_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_complex_double"); test_nrm2,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_complex_double"); test_nrm2_mv,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_int"); test_nrm2 (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_int"); test_nrm2_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index f4d93e7217..5af85a7284 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -171,37 +171,53 @@ int test_nrm2_squared_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_squared_float ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_float"); test_nrm2_squared (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_squared_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_float"); test_nrm2_squared_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_squared_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_double"); test_nrm2_squared (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_squared_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_double"); test_nrm2_squared_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_squared_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_complex_double"); test_nrm2_squared,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_squared_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_complex_double"); test_nrm2_squared_mv,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrm2_squared_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_int"); test_nrm2_squared (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrm2_squared_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_int"); test_nrm2_squared_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_nrminf.hpp b/unit_test/blas/Test_Blas1_nrminf.hpp index d76c9bd965..e1b0f6a238 100644 --- 
a/unit_test/blas/Test_Blas1_nrminf.hpp +++ b/unit_test/blas/Test_Blas1_nrminf.hpp @@ -166,42 +166,57 @@ int test_nrminf_mv() { Test::impl_test_nrminf_mv(132231,5); #endif - return 1; -} + return 1;} #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrminf_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_float"); test_nrminf (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrminf_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mvfloat"); test_nrminf_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrminf_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_double"); test_nrminf (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrminf_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_double"); test_nrminf_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrminf_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_complex_double"); test_nrminf,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrminf_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_complex_double"); test_nrminf_mv,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, nrminf_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_int"); test_nrminf (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, nrminf_mv_int ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_int"); test_nrminf_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_reciprocal.hpp b/unit_test/blas/Test_Blas1_reciprocal.hpp index 2b90248c7c..b0e4ad6b80 100644 --- a/unit_test/blas/Test_Blas1_reciprocal.hpp +++ b/unit_test/blas/Test_Blas1_reciprocal.hpp @@ -227,37 +227,53 @@ int test_reciprocal_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, reciprocal_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_float"); test_reciprocal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, reciprocal_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_float"); test_reciprocal_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, reciprocal_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_double"); test_reciprocal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, reciprocal_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_double"); test_reciprocal_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, reciprocal_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_complex_double"); test_reciprocal,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, reciprocal_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_complex_double"); test_reciprocal_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, reciprocal_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_int"); test_reciprocal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, reciprocal_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_int"); test_reciprocal_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_scal.hpp b/unit_test/blas/Test_Blas1_scal.hpp index fda9fa43c5..fd406767ea 100644 --- a/unit_test/blas/Test_Blas1_scal.hpp +++ b/unit_test/blas/Test_Blas1_scal.hpp @@ -268,45 +268,65 @@ int test_scal_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, scal_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_float"); test_scal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, scal_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_float"); test_scal_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, scal_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double"); test_scal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, scal_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double"); test_scal_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, scal_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_complex_double"); test_scal,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, scal_mv_complex_double ) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_complex_double"); test_scal_mv,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, scal_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_int"); test_scal (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, scal_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_int"); test_scal_mv (); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, scal_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double_int"); test_scal (); + Kokkos::Profiling::popRegion(); } -TEST_F( TestCategory, scal_double_mv_int ) { +TEST_F( TestCategory, scal_mv_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double_int"); test_scal_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index b9804b32d2..bd345a5666 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -164,37 +164,53 @@ int test_sum_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, sum_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_float"); test_sum (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, sum_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_float"); test_sum_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, sum_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_double"); test_sum (); + 
Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, sum_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_double"); test_sum_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, sum_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_complex_double"); test_sum,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, sum_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_complex_double"); test_sum_mv,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, sum_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_int"); test_sum (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, sum_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_int"); test_sum_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas1_team_abs.hpp b/unit_test/blas/Test_Blas1_team_abs.hpp index a54f96296e..ca89b2c090 100644 --- a/unit_test/blas/Test_Blas1_team_abs.hpp +++ b/unit_test/blas/Test_Blas1_team_abs.hpp @@ -63,7 +63,7 @@ namespace Test { expected_result += AT::abs(h_x(i)) * AT::abs(h_x(i)); //KokkosBlas::abs(y,x); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -74,7 +74,7 @@ namespace Test { 
Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::abs(y,c_x); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -142,7 +142,7 @@ namespace Test { Kokkos::View r("Dot::Result",K); //KokkosBlas::abs(y,x); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y,Kokkos::ALL(),teamId), Kokkos::subview(x,Kokkos::ALL(),teamId)); } ); @@ -159,7 +159,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::abs(y,c_x); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAbs", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs(teamMember, Kokkos::subview(y,Kokkos::ALL(),teamId), Kokkos::subview(c_x,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_axpby.hpp b/unit_test/blas/Test_Blas1_team_axpby.hpp index c4274d38a0..b0aa24cbe6 100644 --- a/unit_test/blas/Test_Blas1_team_axpby.hpp +++ b/unit_test/blas/Test_Blas1_team_axpby.hpp @@ -65,7 +65,7 @@ namespace Test { expected_result += ScalarB(a*h_x(i) + b*h_y(i)) * ScalarB(a*h_x(i) + b*h_y(i)); //KokkosBlas::axpby(a,x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpby", policy, 
KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -76,7 +76,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::axpby(a,c_x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -145,7 +145,7 @@ namespace Test { typedef Kokkos::Details::ArithTraits AT; //KokkosBlas::axpby(a,x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(x,Kokkos::ALL(),teamId), b, Kokkos::subview(y,Kokkos::ALL(),teamId)); } ); @@ -159,7 +159,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::axpby(a,c_x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpby", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby(teamMember, a, Kokkos::subview(c_x,Kokkos::ALL(),teamId), b, Kokkos::subview(y,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_axpy.hpp 
b/unit_test/blas/Test_Blas1_team_axpy.hpp index 573699163b..eba3e51eab 100644 --- a/unit_test/blas/Test_Blas1_team_axpy.hpp +++ b/unit_test/blas/Test_Blas1_team_axpy.hpp @@ -64,7 +64,7 @@ namespace Test { expected_result += ScalarB(a*h_x(i) + h_y(i)) * ScalarB(a*h_x(i) + h_y(i)); //KokkosBlas::axpy(a,x,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -75,7 +75,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::axpy(a,c_x,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -140,7 +140,7 @@ namespace Test { Kokkos::View r("Dot::Result",K); //KokkosBlas::axpy(a,x,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(x,Kokkos::ALL(),teamId), Kokkos::subview(y,Kokkos::ALL(),teamId)); } ); @@ -154,7 +154,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::axpy(a,c_x,y); - Kokkos::parallel_for( policy, 
KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamAxpy", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy(teamMember, a, Kokkos::subview(c_x,Kokkos::ALL(),teamId), Kokkos::subview(y,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_dot.hpp b/unit_test/blas/Test_Blas1_team_dot.hpp index adb80e7763..bac183d4fd 100644 --- a/unit_test/blas/Test_Blas1_team_dot.hpp +++ b/unit_test/blas/Test_Blas1_team_dot.hpp @@ -61,7 +61,7 @@ namespace Test { //ScalarA nonconst_nonconst_result = KokkosBlas::dot(a,b); ScalarA nonconst_nonconst_result = 0; - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(b,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -78,7 +78,7 @@ namespace Test { //ScalarA const_const_result = KokkosBlas::dot(c_a,c_b); ScalarA const_const_result = 0; - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(c_a,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(c_b,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -91,7 +91,7 @@ namespace Test { //ScalarA nonconst_const_result = KokkosBlas::dot(a,c_b); ScalarA nonconst_const_result = 0; - Kokkos::parallel_for( policy, 
KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(c_b,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -104,7 +104,7 @@ namespace Test { //ScalarA const_nonconst_result = KokkosBlas::dot(c_a,b); ScalarA const_nonconst_result = 0; - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(c_a,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(b,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -171,7 +171,7 @@ namespace Test { Kokkos::View d_r("Dot::Result",K); //KokkosBlas::dot(r,a,b); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a,Kokkos::ALL(),teamId), Kokkos::subview(b,Kokkos::ALL(),teamId)); } ); @@ -182,7 +182,7 @@ namespace Test { } //KokkosBlas::dot(r,c_a,c_b); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(c_a,Kokkos::ALL(),teamId), 
Kokkos::subview(c_b,Kokkos::ALL(),teamId)); } ); @@ -193,7 +193,7 @@ namespace Test { } //KokkosBlas::dot(r,a,c_b); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(a,Kokkos::ALL(),teamId), Kokkos::subview(c_b,Kokkos::ALL(),teamId)); } ); @@ -204,7 +204,7 @@ namespace Test { } //KokkosBlas::dot(r,c_a,b); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamDot", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot(teamMember, Kokkos::subview(c_a,Kokkos::ALL(),teamId), Kokkos::subview(b,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_mult.hpp b/unit_test/blas/Test_Blas1_team_mult.hpp index 2adc7a05a6..4c230b1d32 100644 --- a/unit_test/blas/Test_Blas1_team_mult.hpp +++ b/unit_test/blas/Test_Blas1_team_mult.hpp @@ -76,7 +76,7 @@ namespace Test { expected_result += ScalarC(b*h_z(i) + a*h_x(i)*h_y(i)) * ScalarC(b*h_z(i) + a*h_x(i)*h_y(i)); //KokkosBlas::mult(b,z,a,x,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), a, Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -85,7 +85,7 @@ namespace Test { 
Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::mult(b,z,a,x,c_y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), a, Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(c_y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -94,7 +94,7 @@ namespace Test { Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::mult(b,z,a,c_x,c_y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), a, Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), Kokkos::subview(c_y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -171,7 +171,7 @@ namespace Test { Kokkos::View r("Dot::Result",K); //KokkosBlas::mult(b,z,a,x,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z,Kokkos::ALL(),teamId), a, x, Kokkos::subview(y,Kokkos::ALL(),teamId)); } ); @@ -183,7 +183,7 @@ namespace Test { Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::mult(b,z,a,x,c_y); - Kokkos::parallel_for( policy, 
KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult(teamMember, b, Kokkos::subview(z,Kokkos::ALL(),teamId), a, x, Kokkos::subview(c_y,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_nrm2.hpp b/unit_test/blas/Test_Blas1_team_nrm2.hpp index 455b75755c..7bfb57f96b 100644 --- a/unit_test/blas/Test_Blas1_team_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_team_nrm2.hpp @@ -53,7 +53,7 @@ namespace Test { Kokkos::View d_r("Nrm2::Result",K); //KokkosBlas::nrm2(r,a); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(a,Kokkos::ALL(),teamId)); } ); @@ -64,7 +64,7 @@ namespace Test { } //KokkosBlas::nrm2(r,c_a); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamNrm2", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2(teamMember, Kokkos::subview(c_a,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_scal.hpp b/unit_test/blas/Test_Blas1_team_scal.hpp index c8d4ade241..46f61a6ca7 100644 --- a/unit_test/blas/Test_Blas1_team_scal.hpp +++ b/unit_test/blas/Test_Blas1_team_scal.hpp @@ -68,7 +68,7 @@ namespace Test { for(int i=0;i r("Dot::Result",K); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal(teamMember, 
Kokkos::subview(y,Kokkos::ALL(),teamId), a, Kokkos::subview(x,Kokkos::ALL(),teamId)); } ); @@ -170,7 +170,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y,Kokkos::ALL(),teamId), a, Kokkos::subview(c_x,Kokkos::ALL(),teamId)); } ); @@ -197,7 +197,7 @@ namespace Test { { expected_result[j] += ScalarB((3.0+j)*h_x(i,j)) * ScalarB((3.0+j)*h_x(i,j)); } } - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y,Kokkos::ALL(),teamId), params(teamId), Kokkos::subview(x,Kokkos::ALL(),teamId)); } ); @@ -212,7 +212,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal(teamMember, Kokkos::subview(y,Kokkos::ALL(),teamId), params(teamId), Kokkos::subview(c_x,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_team_update.hpp b/unit_test/blas/Test_Blas1_team_update.hpp index 5be4c03892..9545f42829 100644 --- a/unit_test/blas/Test_Blas1_team_update.hpp +++ b/unit_test/blas/Test_Blas1_team_update.hpp @@ -79,7 +79,7 @@ namespace Test { expected_result += ScalarB(c*h_z(i) + a*h_x(i) + b*h_y(i)) * ScalarB(c*h_z(i) + a*h_x(i) + b*h_y(i)); //KokkosBlas::update(a,x,b,y,c,z); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + 
Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), c, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -88,7 +88,7 @@ namespace Test { Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::update(a,c_x,b,y,c,z); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), c, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -97,7 +97,7 @@ namespace Test { Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::update(a,c_x,b,c_y,c,z); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(c_x,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), b, Kokkos::subview(c_y,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N)), c, Kokkos::subview(z,Kokkos::make_pair(teamId*team_data_siz,(teamId < M-1)?(teamId+1)*team_data_siz:N))); } ); @@ -175,7 +175,7 @@ namespace Test { 
Kokkos::View r("Dot::Result",K); //KokkosBlas::update(a,x,b,y,c,z); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(x,Kokkos::ALL(),teamId), b, Kokkos::subview(y,Kokkos::ALL(),teamId), c, Kokkos::subview(z,Kokkos::ALL(),teamId)); } ); @@ -187,7 +187,7 @@ namespace Test { Kokkos::deep_copy(b_z,b_org_z); //KokkosBlas::update(a,c_x,b,y,c,z); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update(teamMember, a, Kokkos::subview(c_x,Kokkos::ALL(),teamId), b, Kokkos::subview(y,Kokkos::ALL(),teamId), c, Kokkos::subview(z,Kokkos::ALL(),teamId)); } ); diff --git a/unit_test/blas/Test_Blas1_update.hpp b/unit_test/blas/Test_Blas1_update.hpp index 50301d2453..70e516c3e8 100644 --- a/unit_test/blas/Test_Blas1_update.hpp +++ b/unit_test/blas/Test_Blas1_update.hpp @@ -253,45 +253,64 @@ int test_update_mv() { #if defined(KOKKOSKERNELS_INST_FLOAT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, update_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_float"); test_update (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, update_mv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_float"); test_update_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, update_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double"); test_update (); } TEST_F( 
TestCategory, update_mv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double"); test_update_mv (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, update_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_complex_double"); test_update,Kokkos::complex,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, update_mv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_complex_double"); test_update_mv,Kokkos::complex,Kokkos::complex,TestExecSpace> (); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, update_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_int"); test_update (); + Kokkos::Profiling::popRegion(); } TEST_F( TestCategory, update_mv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_int"); test_update_mv (); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, update_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double_int"); test_update (); + Kokkos::Profiling::popRegion(); } -TEST_F( TestCategory, update_double_mv_int ) { +TEST_F( TestCategory, update_mv_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double_int"); test_update_mv (); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas2_gemv.hpp b/unit_test/blas/Test_Blas2_gemv.hpp index 43f9187339..a0b15ce942 100644 --- a/unit_test/blas/Test_Blas2_gemv.hpp +++ b/unit_test/blas/Test_Blas2_gemv.hpp @@ -137,30 +137,40 @@ int test_gemv(const char* mode) { #if defined(KOKKOSKERNELS_INST_FLOAT) || 
(!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemv_float ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_float"); test_gemv ("N"); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemv_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double"); test_gemv ("N"); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemv_complex_double ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_complex_double"); test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("N"); + Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F( TestCategory, gemv_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_int"); test_gemv ("N"); + Kokkos::Profiling::popRegion(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, gemv_double_int ) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double_int"); test_gemv ("N"); + Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas2_team_gemv.hpp b/unit_test/blas/Test_Blas2_team_gemv.hpp index d2252b49c5..f0dd70d453 100644 --- a/unit_test/blas/Test_Blas2_team_gemv.hpp +++ b/unit_test/blas/Test_Blas2_team_gemv.hpp @@ -87,7 +87,7 @@ namespace Test { char trans = mode[0]; //KokkosBlas::gemv(mode,a,A,x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamGemm", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); 
KokkosBlas::Experimental::gemv(teamMember, trans, a, Kokkos::subview(A,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N),Kokkos::ALL()), x, b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N))); } ); @@ -98,7 +98,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::gemv(mode,a,A,c_x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamGemm", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::gemv(teamMember, trans, a, Kokkos::subview(A,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N),Kokkos::ALL()), c_x, b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N))); } ); @@ -109,7 +109,7 @@ namespace Test { Kokkos::deep_copy(b_y,b_org_y); //KokkosBlas::gemv(mode,a,c_A,c_x,b,y); - Kokkos::parallel_for( policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { + Kokkos::parallel_for( "KokkosBlas::Test::TeamGemm", policy, KOKKOS_LAMBDA ( const team_member &teamMember ) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::gemv(teamMember, trans, a, Kokkos::subview(c_A,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N),Kokkos::ALL()), c_x, b, Kokkos::subview(y,Kokkos::make_pair(teamId*team_data_siz,(teamId < K-1)?(teamId+1)*team_data_siz:N))); } ); From d8b61392b99bb377f6a66930de76592433507001 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 13:04:20 -0700 Subject: [PATCH 20/47] KokkosBatched - add profiling decoration --- unit_test/batched/Test_Batched_SerialGemm.hpp | 11 +++++- unit_test/batched/Test_Batched_SerialGemv.hpp | 11 +++++- .../batched/Test_Batched_SerialInverseLU.hpp | 34 +++++++++++++++-- unit_test/batched/Test_Batched_SerialLU.hpp | 11 +++++- 
.../batched/Test_Batched_SerialMatUtil.hpp | 17 +++++++-- .../batched/Test_Batched_SerialSolveLU.hpp | 33 +++++++++++++++-- unit_test/batched/Test_Batched_SerialTrsm.hpp | 11 +++++- unit_test/batched/Test_Batched_SerialTrsv.hpp | 9 +++++ unit_test/batched/Test_Batched_TeamGemm.hpp | 11 +++++- unit_test/batched/Test_Batched_TeamGemv.hpp | 11 +++++- .../batched/Test_Batched_TeamInverseLU.hpp | 37 +++++++++++++++++-- unit_test/batched/Test_Batched_TeamLU.hpp | 12 +++++- .../batched/Test_Batched_TeamMatUtil.hpp | 16 +++++++- .../batched/Test_Batched_TeamSolveLU.hpp | 34 +++++++++++++++-- unit_test/batched/Test_Batched_TeamTrsm.hpp | 12 +++++- unit_test/batched/Test_Batched_TeamTrsv.hpp | 12 +++++- 16 files changed, 257 insertions(+), 25 deletions(-) diff --git a/unit_test/batched/Test_Batched_SerialGemm.hpp b/unit_test/batched/Test_Batched_SerialGemm.hpp index 6919654952..f4ddb9be47 100644 --- a/unit_test/batched/Test_Batched_SerialGemm.hpp +++ b/unit_test/batched/Test_Batched_SerialGemm.hpp @@ -53,8 +53,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGemm"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _c.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_SerialGemv.hpp b/unit_test/batched/Test_Batched_SerialGemv.hpp index 02daaced93..d03461c4ab 100644 --- a/unit_test/batched/Test_Batched_SerialGemv.hpp +++ b/unit_test/batched/Test_Batched_SerialGemv.hpp @@ -51,8 +51,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGemv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); Kokkos::RangePolicy policy(0, _c.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_SerialInverseLU.hpp b/unit_test/batched/Test_Batched_SerialInverseLU.hpp index f195349216..34296a2124 100644 --- a/unit_test/batched/Test_Batched_SerialInverseLU.hpp +++ b/unit_test/batched/Test_Batched_SerialInverseLU.hpp @@ -60,8 +60,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); Kokkos::RangePolicy policy(0, _c.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::GemmFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -87,8 +96,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::LUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -114,8 +132,18 @@ namespace Test { inline void run() { + typedef typename AViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::InverseLUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } }; diff --git a/unit_test/batched/Test_Batched_SerialLU.hpp b/unit_test/batched/Test_Batched_SerialLU.hpp index dbe3bb44f1..b7b2e8665c 100644 --- a/unit_test/batched/Test_Batched_SerialLU.hpp +++ b/unit_test/batched/Test_Batched_SerialLU.hpp @@ -37,8 +37,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/Test_Batched_SerialMatUtil.hpp index 02eca5ac5c..d7de734ddc 100644 --- a/unit_test/batched/Test_Batched_SerialMatUtil.hpp +++ b/unit_test/batched/Test_Batched_SerialMatUtil.hpp @@ -69,11 +69,22 @@ namespace Test { inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialMatUtil"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name_work_tag = ( std::is_same::value ? "::KokkosBatched" : + std::is_same::value ? 
"::Naive" : "::UnknownWorkTag"); + std::string name_test_id = ( TestID == BatchedSet ? "Set" : + TestID == BatchedScale ? "Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); return 0; - //MD 08/2017 NOTE: compilation was failing with werror. - //I added dummy return. } }; diff --git a/unit_test/batched/Test_Batched_SerialSolveLU.hpp b/unit_test/batched/Test_Batched_SerialSolveLU.hpp index a482eafbee..2b014b9ac7 100644 --- a/unit_test/batched/Test_Batched_SerialSolveLU.hpp +++ b/unit_test/batched/Test_Batched_SerialSolveLU.hpp @@ -60,8 +60,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _c.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::GemmFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -87,8 +96,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::LUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -114,8 +132,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::SolveLUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_SerialTrsm.hpp b/unit_test/batched/Test_Batched_SerialTrsm.hpp index 079aeca5ea..bd2cf660c7 100644 --- a/unit_test/batched/Test_Batched_SerialTrsm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrsm.hpp @@ -54,8 +54,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialTrsm"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _b.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_SerialTrsv.hpp b/unit_test/batched/Test_Batched_SerialTrsv.hpp index ee45e3896e..cdaf7bf645 100644 --- a/unit_test/batched/Test_Batched_SerialTrsv.hpp +++ b/unit_test/batched/Test_Batched_SerialTrsv.hpp @@ -52,8 +52,17 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialTrsv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _b.extent(0)); Kokkos::parallel_for(policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamGemm.hpp b/unit_test/batched/Test_Batched_TeamGemm.hpp index 66b1eea131..b1dba62565 100644 --- a/unit_test/batched/Test_Batched_TeamGemm.hpp +++ b/unit_test/batched/Test_Batched_TeamGemm.hpp @@ -58,9 +58,18 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamGemm"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); const int league_size = _c.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamGemv.hpp b/unit_test/batched/Test_Batched_TeamGemv.hpp index b331d5941b..97b45bd3f2 100644 --- a/unit_test/batched/Test_Batched_TeamGemv.hpp +++ b/unit_test/batched/Test_Batched_TeamGemv.hpp @@ -56,9 +56,18 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGemm"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); const int league_size = _c.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamInverseLU.hpp b/unit_test/batched/Test_Batched_TeamInverseLU.hpp index bd63b07db3..18a055626c 100644 --- a/unit_test/batched/Test_Batched_TeamInverseLU.hpp +++ b/unit_test/batched/Test_Batched_TeamInverseLU.hpp @@ -67,9 +67,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _c.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::GemmFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -100,9 +110,20 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + + const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::LUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -130,9 +151,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamInverseLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::InverseLUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamLU.hpp b/unit_test/batched/Test_Batched_TeamLU.hpp index 0b7654edeb..53bdf56b89 100644 --- a/unit_test/batched/Test_Batched_TeamLU.hpp +++ b/unit_test/batched/Test_Batched_TeamLU.hpp @@ -43,9 +43,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/Test_Batched_TeamMatUtil.hpp index f75c930116..2fd5461573 100644 --- a/unit_test/batched/Test_Batched_TeamMatUtil.hpp +++ b/unit_test/batched/Test_Batched_TeamMatUtil.hpp @@ -74,9 +74,23 @@ namespace Test { inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialMatUtil"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name_work_tag = ( std::is_same::value ? "::KokkosBatched" : + std::is_same::value ? "::Naive" : "::UnknownWorkTag"); + std::string name_test_id = ( TestID == BatchedSet ? "Set" : + TestID == BatchedScale ? "Scale" : "UnknownTest"); + std::string name = name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); return 0; } diff --git a/unit_test/batched/Test_Batched_TeamSolveLU.hpp b/unit_test/batched/Test_Batched_TeamSolveLU.hpp index 22ce33721e..8b701d2109 100644 --- a/unit_test/batched/Test_Batched_TeamSolveLU.hpp +++ b/unit_test/batched/Test_Batched_TeamSolveLU.hpp @@ -67,9 +67,18 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); const int league_size = _c.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::GemmFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -100,9 +109,18 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::LUFunctor").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; @@ -130,9 +148,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamSolveLU"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _a.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for((name+"::SolveLU").c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamTrsm.hpp b/unit_test/batched/Test_Batched_TeamTrsm.hpp index 89272cb956..8bca273d44 100644 --- a/unit_test/batched/Test_Batched_TeamTrsm.hpp +++ b/unit_test/batched/Test_Batched_TeamTrsm.hpp @@ -59,9 +59,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamTrsm"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _b.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/batched/Test_Batched_TeamTrsv.hpp b/unit_test/batched/Test_Batched_TeamTrsv.hpp index 5ac7f3aa63..f1fd84a141 100644 --- a/unit_test/batched/Test_Batched_TeamTrsv.hpp +++ b/unit_test/batched/Test_Batched_TeamTrsv.hpp @@ -57,9 +57,19 @@ namespace Test { inline void run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamTrsv"); + std::string name_value_type = ( std::is_same::value ? "::Float" : + std::is_same::value ? "::Double" : + std::is_same >::value ? "::ComplexFloat" : + std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion( name.c_str() ); + const int league_size = _b.extent(0); Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); } }; From 0f638816f5ddc8c1c82a4d605522077a7240c191 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 13:08:16 -0700 Subject: [PATCH 21/47] KokkosBatched - add profiling decoration --- unit_test/batched/Test_Batched_TeamInverseLU.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_test/batched/Test_Batched_TeamInverseLU.hpp b/unit_test/batched/Test_Batched_TeamInverseLU.hpp index 18a055626c..cefd97b732 100644 --- a/unit_test/batched/Test_Batched_TeamInverseLU.hpp +++ b/unit_test/batched/Test_Batched_TeamInverseLU.hpp @@ -151,7 +151,7 @@ namespace Test { inline void run() { - typedef typename ViewType::value_type value_type; + typedef 
typename AViewType::value_type value_type; std::string name_region("KokkosBatched::Test::TeamInverseLU"); std::string name_value_type = ( std::is_same::value ? "::Float" : std::is_same::value ? "::Double" : From 0684400483d9a5f13948c424a4ad10685245c712 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 13:35:30 -0700 Subject: [PATCH 22/47] KokkosSparse - add profiling unit test --- unit_test/batched/Test_Batched_SerialTrsv.hpp | 2 +- unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp | 2 +- unit_test/sparse/Test_Sparse_replaceSumInto.hpp | 2 +- unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp | 2 +- unit_test/sparse/Test_Sparse_spmv.hpp | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/unit_test/batched/Test_Batched_SerialTrsv.hpp b/unit_test/batched/Test_Batched_SerialTrsv.hpp index cdaf7bf645..9ee0e6c294 100644 --- a/unit_test/batched/Test_Batched_SerialTrsv.hpp +++ b/unit_test/batched/Test_Batched_SerialTrsv.hpp @@ -61,7 +61,7 @@ namespace Test { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion( name.c_str() ); Kokkos::RangePolicy policy(0, _b.extent(0)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; diff --git a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp index 6bfd32081f..b6b4b7b304 100644 --- a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp @@ -358,7 +358,7 @@ testBlockCrsMatrix () result_view_type d_results("d_results"); auto h_results = Kokkos::create_mirror_view( d_results ); - Kokkos::parallel_for( Kokkos::RangePolicy(0, 1), Test::TestFunctor< block_crs_matrix_type, result_view_type>( A, d_results ) ); + Kokkos::parallel_for( "KokkosSparse::Test::BlockCrsMatrix", Kokkos::RangePolicy(0, 1), Test::TestFunctor< block_crs_matrix_type, result_view_type>( A, d_results ) ); Kokkos::deep_copy( 
h_results, d_results ); diff --git a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp index f1ea08efc7..49011e7536 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp @@ -113,7 +113,7 @@ namespace { // (anonymous) typedef Kokkos::RangePolicy policy_type; ::Test::ModifyEvenNumberedRows functor (A, replace, sorted, atomic); - Kokkos::parallel_for (policy_type (0, A.numRows ()), functor); + Kokkos::parallel_for ( "KokkosSparse::Test::ReplaceSumInto", policy_type (0, A.numRows ()), functor); } template diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp index cb7171cb12..913896a37b 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp @@ -177,7 +177,7 @@ namespace { // (anonymous) functor_type functor (A, replace, sorted, atomic); ordinal_type numModified = 0; policy_type range (0, A.numRows ()); - Kokkos::parallel_reduce (std::string ("ModifyEntries"), range, functor, numModified); + Kokkos::parallel_reduce ("KokkosSparse::Test::ModifyEntries", range, functor, numModified); const ordinal_type numEntShouldModify = static_cast (numEntToModify) <= A.numCols () ? 
diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 7c8c68aca0..2a636c74c0 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -108,11 +108,11 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, //KokkosKernels::Impl::print_1Dview(y); typedef Kokkos::Details::ArithTraits AT; int num_errors = 0; - Kokkos::parallel_reduce("KokkosKernels::UnitTests::spmv" + Kokkos::parallel_reduce("KokkosSparse::Test::spmv" ,my_exec_space(0, y.extent(0)) ,fSPMV(expected_y,y,eps) ,num_errors); - if(num_errors>0) printf("KokkosKernels::UnitTests::spmv: %i errors of %i with params: %lf %lf\n", + if(num_errors>0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors,y.extent_int(0),AT::abs(alpha),AT::abs(beta)); EXPECT_TRUE(num_errors==0); } @@ -146,11 +146,11 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto auto y_spmv = Kokkos::subview (y, Kokkos::ALL (), i); int num_errors = 0; - Kokkos::parallel_reduce("KokkosKernels::UnitTests::spmv_mv" + Kokkos::parallel_reduce("KokkosSparse::Test::spmv_mv" ,my_exec_space(0,y_i.extent(0)) ,fSPMV(y_i, y_spmv, eps) ,num_errors); - if(num_errors>0) printf("KokkosKernels::UnitTests::spmv_mv: %i errors of %i for mv %i\n", + if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", num_errors,y_i.extent_int(0),i); EXPECT_TRUE(num_errors==0); } From a9c39c1ee285b11d0809806469b5a2dcfc44ddf4 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 14:13:59 -0700 Subject: [PATCH 23/47] KokkosKernels - add profiling label for parallel patterns --- src/common/KokkosKernels_PrintUtils.hpp | 2 +- src/common/KokkosKernels_SimpleUtils.hpp | 16 +++--- src/common/KokkosKernels_SparseUtils.hpp | 56 +++++++++---------- src/common/KokkosKernels_Utils.hpp | 46 +++++++-------- src/common/KokkosKernels_VectorUtils.hpp | 6 +- 
.../impl/KokkosGraph_Distance2Color_impl.hpp | 6 +- .../impl/KokkosGraph_GraphColor_impl.hpp | 29 ++++------ src/sparse/KokkosSparse_spadd.hpp | 20 +++---- 8 files changed, 87 insertions(+), 94 deletions(-) diff --git a/src/common/KokkosKernels_PrintUtils.hpp b/src/common/KokkosKernels_PrintUtils.hpp index 3101478b5c..6d1574b2ff 100644 --- a/src/common/KokkosKernels_PrintUtils.hpp +++ b/src/common/KokkosKernels_PrintUtils.hpp @@ -85,7 +85,7 @@ inline void kk_get_histogram( in_lno_view_t in_view, out_lno_view_t histogram /*must be initialized with 0s*/){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, in_elements), Histogram(in_view, histogram)); + Kokkos::parallel_for( "KokkosKernels::Common::GetHistogram", my_exec_space(0, in_elements), Histogram(in_view, histogram)); MyExecSpace::fence(); } diff --git a/src/common/KokkosKernels_SimpleUtils.hpp b/src/common/KokkosKernels_SimpleUtils.hpp index 15f4e59c68..7ca1ae22f4 100644 --- a/src/common/KokkosKernels_SimpleUtils.hpp +++ b/src/common/KokkosKernels_SimpleUtils.hpp @@ -115,7 +115,7 @@ struct InclusiveParallelPrefixSum{ template inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_elements, view_t arr){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan( "KokkosKernels::PrefixSum", my_exec_space(0, num_elements), ExclusiveParallelPrefixSum(arr)); + Kokkos::parallel_scan( "KokkosKernels::Common::PrefixSum", my_exec_space(0, num_elements), ExclusiveParallelPrefixSum(arr)); } @@ -130,7 +130,7 @@ inline void kk_exclusive_parallel_prefix_sum(typename view_t::value_type num_ele template void kk_inclusive_parallel_prefix_sum(typename forward_array_type::value_type num_elements, forward_array_type arr){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan( "KokkosKernels::PrefixSum", my_exec_space(0, num_elements), InclusiveParallelPrefixSum(arr)); + Kokkos::parallel_scan( "KokkosKernels::Common::PrefixSum", my_exec_space(0, 
num_elements), InclusiveParallelPrefixSum(arr)); } template @@ -172,7 +172,7 @@ struct DiffReductionFunctor{ template inline void kk_reduce_diff_view(size_t num_elements, view_t smaller, view2_t bigger, typename view_t::non_const_value_type & reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0, num_elements), DiffReductionFunctor(smaller, bigger), reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), DiffReductionFunctor(smaller, bigger), reduction); } template @@ -190,7 +190,7 @@ struct DiffReductionFunctorP{ template inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, const it *bigger, it & reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0, num_elements), DiffReductionFunctorP(smaller, bigger), reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceDiffView", my_exec_space(0, num_elements), DiffReductionFunctorP(smaller, bigger), reduction); } @@ -204,13 +204,13 @@ inline void kkp_reduce_diff_view(const size_t num_elements, const it *smaller, c template inline void kk_reduce_view(size_t num_elements, view_t arr, typename view_t::value_type & reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0, num_elements), ReductionFunctor(arr), reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceView", my_exec_space(0, num_elements), ReductionFunctor(arr), reduction); } template inline void kk_reduce_view2(size_t num_elements, view_t arr, size_t & reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0, num_elements), ReductionFunctor2(arr), reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceView2", my_exec_space(0, num_elements), ReductionFunctor2(arr), reduction); } template::mag_type> @@ -248,7 +248,7 @@ bool kk_is_identical_view(view_type1 view1, view_type2 
view2, eps_type eps){ typedef Kokkos::RangePolicy my_exec_space; size_t issame = 0; - Kokkos::parallel_reduce( my_exec_space(0,num_elements), + Kokkos::parallel_reduce( "KokkosKernels::Common::IsIdenticalView", my_exec_space(0,num_elements), IsIdenticalFunctor(view1, view2, eps), issame); MyExecSpace::fence(); if (issame > 0){ @@ -296,7 +296,7 @@ struct ReduceMaxFunctor{ template void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &max_reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( "KokkosKernels::FindMax", my_exec_space(0,num_elements), ReduceMaxFunctor(view_to_reduce), max_reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceMax", my_exec_space(0,num_elements), ReduceMaxFunctor(view_to_reduce), max_reduction); } diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 484daa9605..ab0df8ae63 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -491,10 +491,10 @@ inline void kk_transpose_graph( if (use_dynamic_scheduling){ - Kokkos::parallel_for( d_count_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeGraph::DynamicSchedule::S0", d_count_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); } else { - Kokkos::parallel_for( count_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeGraph::StaticSchedule::S0", count_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); } MyExecSpace::fence(); @@ -506,10 +506,10 @@ inline void kk_transpose_graph( if (use_dynamic_scheduling){ - Kokkos::parallel_for( fill_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); + Kokkos::parallel_for( 
"KokkosKernels::Common::TransposeGraph::DynamicSchedule::S1", fill_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); } else { - Kokkos::parallel_for( d_fill_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeGraph::StaticSchedule::S1", d_fill_tp_t(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), tm); } MyExecSpace::fence(); } @@ -683,7 +683,7 @@ void kk_create_reverse_map( frsf frm (forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); - Kokkos::parallel_for (my_cnt_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::CreateReverseMap::NonAtomic::S0", my_cnt_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); @@ -692,12 +692,12 @@ void kk_create_reverse_map( (tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace::fence(); - Kokkos::parallel_for ( + Kokkos::parallel_for ("KokkosKernels::Common::CreateReverseMap::NonAtomic::S1", my_exec_space (0, num_reverse_elements + 1) , StridedCopy1 (tmp_color_xadj, reverse_map_xadj, scale_size)); MyExecSpace::fence(); - Kokkos::parallel_for (my_fill_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::CreateReverseMap::NonAtomic::S2",my_fill_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); } else @@ -713,7 +713,7 @@ void kk_create_reverse_map( rmp_functor_type frm (forward_map, tmp_color_xadj, reverse_map_adj); - Kokkos::parallel_for (my_cnt_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::CreateReverseMap::Atomic::S0", my_cnt_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); //kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); @@ -724,7 +724,7 @@ void kk_create_reverse_map( Kokkos::deep_copy (reverse_map_xadj, tmp_color_xadj); 
MyExecSpace::fence(); - Kokkos::parallel_for (my_fill_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::CreateReverseMap::Atomic::S1", my_fill_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); } } @@ -823,7 +823,7 @@ inline size_t kk_is_d1_coloring_valid( struct ColorChecker cc(num_rows, xadj, adj, v_colors, team_work_chunk_size); size_t num_conf = 0; - Kokkos::parallel_reduce( dynamic_team_policy(num_rows / team_work_chunk_size + 1 , + Kokkos::parallel_reduce( "KokkosKernels::Common::IsD1ColoringValie", dynamic_team_policy(num_rows / team_work_chunk_size + 1 , suggested_team_size, vector_size), cc, num_conf); MyExecSpace::fence(); @@ -1288,10 +1288,10 @@ void kk_get_lower_triangle_count_parallel( if (use_dynamic_scheduling){ - Kokkos::parallel_for( d_count_tp_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); + Kokkos::parallel_for( "KokkosKernels::Common::GetLowerTriangleCount::DynamicSchedule", d_count_tp_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); } else { - Kokkos::parallel_for( count_tp_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); + Kokkos::parallel_for( "KokkosKernels::Common::GetLowerTriangleCount::StaticSchedule", count_tp_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); } ExecutionSpace::fence(); } @@ -1387,7 +1387,7 @@ void kk_sort_by_row_size_parallel( SortItem * num_elements = &(vnum_elements[0]); - Kokkos::parallel_for( my_exec_space(0, nv), + Kokkos::parallel_for( "KokkosKernels::Common::SortByRowSize::S0", my_exec_space(0, nv), KOKKOS_LAMBDA(const lno_t& row) { lno_t row_size = in_xadj[row+1] - in_xadj[row]; num_elements[row].size = row_size; @@ -1398,19 +1398,19 @@ void kk_sort_by_row_size_parallel( std::less()); if (sort_decreasing_order == 1){ - Kokkos::parallel_for( my_exec_space(0, nv), + Kokkos::parallel_for( "KokkosKernels::Common::SortByRowSize::S1", my_exec_space(0, nv), 
KOKKOS_LAMBDA(const lno_t& row) { new_indices[num_elements[row].id] = row; }); } else if (sort_decreasing_order == 0){ - Kokkos::parallel_for( my_exec_space(0, nv), + Kokkos::parallel_for( "KokkosKernels::Common::SortByRowSize::S2", my_exec_space(0, nv), KOKKOS_LAMBDA(const lno_t& row) { new_indices[num_elements[row].id] = nv - row - 1; }); } else { - Kokkos::parallel_for( my_exec_space(0, nv), + Kokkos::parallel_for( "KokkosKernels::Common::SortByRowSize::S3", my_exec_space(0, nv), KOKKOS_LAMBDA(const lno_t& row) { if (row & 1){ new_indices[num_elements[row].id] = nv - (row + 1) / 2; @@ -1475,10 +1475,10 @@ void kk_get_lower_triangle_fill_parallel( if (use_dynamic_scheduling){ - Kokkos::parallel_for( d_fill_p_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); + Kokkos::parallel_for( "KokkosKernels::Common::GetLowerTriangleFill::DynamicSchedule", d_fill_p_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); } else { - Kokkos::parallel_for( fill_p_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); + Kokkos::parallel_for( "KokkosKernels::Common::GetLowerTriangleFill::StaticSchedule", fill_p_t(nv / team_work_chunk_size + 1 , suggested_team_size, vector_size), ltm); } ExecutionSpace::fence(); } @@ -1848,7 +1848,7 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( //const lno_t nr = in_rowmap.extent(0) - 1; typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for(my_exec_space(0, ne + 1), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S0", my_exec_space(0, ne + 1), KOKKOS_LAMBDA(const lno_t& i) { out_rowmap[i] = i * 2; }); @@ -1862,7 +1862,7 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( out_entries = out_cols_view_t(Kokkos::ViewAllocateWithoutInitializing("LL"), 2 * ne); //TODO MAKE IT WITH TEAMS. 
- Kokkos::parallel_for(my_exec_space(0, nr), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S1", my_exec_space(0, nr), KOKKOS_LAMBDA(const size_type& row) { size_type begin = in_rowmap(row); lno_t row_size = in_rowmap(row + 1) - begin; @@ -1918,7 +1918,7 @@ void kk_create_incidence_matrix_from_lower_triangle( kk_exclusive_parallel_prefix_sum(nr+1, out_rowmap); exec_space::fence(); - Kokkos::parallel_for(my_exec_space(0, nr + 1), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S0", my_exec_space(0, nr + 1), KOKKOS_LAMBDA(const lno_t& i) { out_rowmap[i] += in_lower_rowmap[i]; }); @@ -1928,7 +1928,7 @@ void kk_create_incidence_matrix_from_lower_triangle( out_entries = out_cols_view_t(Kokkos::ViewAllocateWithoutInitializing("LL"), 2*ne); - Kokkos::parallel_for(my_exec_space(0, nr), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S1", my_exec_space(0, nr), KOKKOS_LAMBDA(const size_type& row) { size_type begin = in_lower_rowmap(row); lno_t row_size = in_lower_rowmap(row + 1) - begin; @@ -1945,7 +1945,7 @@ void kk_create_incidence_matrix_from_lower_triangle( } }); exec_space::fence(); - Kokkos::parallel_for(my_exec_space(0, ne), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S2", my_exec_space(0, ne), KOKKOS_LAMBDA(const size_type& edge_ind) { lno_t col = in_lower_entries[edge_ind]; typedef typename std::remove_reference< decltype( out_rowmap_copy(0) ) >::type atomic_incr_type; @@ -2017,13 +2017,13 @@ void kk_create_incidence_matrix_from_original_matrix( out_row_map_view_t out_rowmap_copy (Kokkos::ViewAllocateWithoutInitializing("tmp"), nr+1); //out_rowmap = out_row_map_view_t("LL", nr+1); - Kokkos::parallel_for(my_exec_space(0, nr+1), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromOriginalTriangle::S0", my_exec_space(0, nr+1), KOKKOS_LAMBDA(const 
lno_t& i) { out_rowmap_copy[i] = in_rowmap[i]; }); if (sort_decreasing_order){ - Kokkos::parallel_for(my_exec_space(0, nr), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromOriginalTriangle::S1", my_exec_space(0, nr), KOKKOS_LAMBDA(const size_type& row) { size_type begin = in_rowmap(row); lno_t row_size = in_rowmap(row + 1) - begin; @@ -2055,7 +2055,7 @@ void kk_create_incidence_matrix_from_original_matrix( } else { - Kokkos::parallel_for(my_exec_space(0, nr), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromOriginalTriangle::S2", my_exec_space(0, nr), KOKKOS_LAMBDA(const size_type& row) { size_type begin = in_rowmap(row); lno_t row_size = in_rowmap(row + 1) - begin; @@ -2087,7 +2087,7 @@ void kk_create_incidence_matrix_from_original_matrix( //out_rowmap = out_row_map_view_t("LL", nr+1); - Kokkos::parallel_for(my_exec_space(0, nr+1), + Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromOriginalTriangle::S3", my_exec_space(0, nr+1), KOKKOS_LAMBDA(const lno_t& i) { out_rowmap[i] = in_rowmap[i]; }); @@ -2130,7 +2130,7 @@ template void kk_reduce_numrows_larger_than_threshold(size_t num_elements, view_type view_to_reduce, typename view_type::const_value_type threshold, typename view_type::non_const_value_type &sum_reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0,num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", my_exec_space(0,num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); } } diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index 2fc1635d18..406e67090f 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -708,14 +708,14 @@ struct LinearInitialization{ template void linear_init(typename array_type::value_type 
num_elements, array_type arr){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, num_elements), LinearInitialization(arr)); + Kokkos::parallel_for( "KokkosKernels::Common::LinearInit", my_exec_space(0, num_elements), LinearInitialization(arr)); } template void remove_zeros_in_xadj_vector(typename forward_array_type::value_type num_elements, forward_array_type arr){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_scan( my_exec_space(0, num_elements), PropogataMaxValstoZeros(arr)); + Kokkos::parallel_scan( "KokkosKernels::Common::RemoveZerosInXadjVector",my_exec_space(0, num_elements), PropogataMaxValstoZeros(arr)); } @@ -885,18 +885,18 @@ void create_reverse_map( tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); - Kokkos::parallel_for ("KokkosKernels::Impl::ReverseMapScaleInit",my_exec_space (0, num_forward_elements) , rmi); + Kokkos::parallel_for ("KokkosKernels::Common::ReverseMapScaleInit",my_exec_space (0, num_forward_elements) , rmi); MyExecSpace::fence(); inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace::fence(); - Kokkos::parallel_for ("KokkosKernels::Impl::StridedCopy",my_exec_space (0, num_reverse_elements + 1) , StridedCopy(tmp_color_xadj, reverse_map_xadj, scale_size)); + Kokkos::parallel_for ("KokkosKernels::Common::StridedCopy",my_exec_space (0, num_reverse_elements + 1) , StridedCopy(tmp_color_xadj, reverse_map_xadj, scale_size)); MyExecSpace::fence(); Fill_Reverse_Scale_Map frm (forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); - Kokkos::parallel_for ("KokkosKernels::Impl::FillReverseMap",my_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::FillReverseMap",my_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); } else @@ -906,7 +906,7 @@ void create_reverse_map( Reverse_Map_Init rmi(forward_map, reverse_map_xadj); - Kokkos::parallel_for 
("KokkosKernels::Impl::ReverseMapInit",my_exec_space (0, num_forward_elements) , rmi); + Kokkos::parallel_for ("KokkosKernels::Common::ReverseMapInit",my_exec_space (0, num_forward_elements) , rmi); MyExecSpace::fence(); //print_1Dview(reverse_map_xadj); @@ -916,7 +916,7 @@ void create_reverse_map( Kokkos::deep_copy (tmp_color_xadj, reverse_map_xadj); MyExecSpace::fence(); Fill_Reverse_Map frm (forward_map, tmp_color_xadj, reverse_map_adj); - Kokkos::parallel_for ("KokkosKernels::Impl::FillReverseMap",my_exec_space (0, num_forward_elements) , frm); + Kokkos::parallel_for ("KokkosKernels::Common::FillReverseMap",my_exec_space (0, num_forward_elements) , frm); MyExecSpace::fence(); } } @@ -953,7 +953,7 @@ void permute_vector( ){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Impl::PermuteVector", my_exec_space(0,num_elements), + Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", my_exec_space(0,num_elements), PermuteVector(old_vector, new_vector, old_to_new_index_map)); } @@ -997,7 +997,7 @@ void permute_block_vector( ){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for("KokkosKernels::Impl::PermuteVector", my_exec_space(0,num_elements), + Kokkos::parallel_for("KokkosKernels::Common::PermuteVector", my_exec_space(0,num_elements), PermuteBlockVector(block_size, old_vector, new_vector, old_to_new_index_map)); } @@ -1152,7 +1152,7 @@ void symmetrize_and_get_lower_diagonal_edge_list( #endif //std::cout << "max_allowed_team_size:" << max_allowed_team_size << " vs:" << vector_size << " tsm:" << teamSizeMax<< std::endl; - Kokkos::parallel_for( + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S0", team_policy(num_rows_to_symmetrize / teamSizeMax + 1 , teamSizeMax, vector_size), fse/*, num_symmetric_edges*/); MyExecSpace::fence(); @@ -1211,7 +1211,7 @@ void symmetrize_and_get_lower_diagonal_edge_list( teamSizeMax); #endif - Kokkos::parallel_for( + 
Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeAndGetLowerDiagonalEdgeList::S1", team_policy(num_rows_to_symmetrize / teamSizeMax + 1 , teamSizeMax, vector_size), FSCH); MyExecSpace::fence(); @@ -1296,7 +1296,7 @@ void symmetrize_graph_symbolic_hashmap( teamSizeMax); #endif - Kokkos::parallel_for( + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S0", team_policy(num_rows_to_symmetrize / teamSizeMax + 1 , teamSizeMax, vector_size), fse/*, num_symmetric_edges*/); MyExecSpace::fence(); @@ -1352,7 +1352,7 @@ void symmetrize_graph_symbolic_hashmap( teamSizeMax); #endif - Kokkos::parallel_for( + Kokkos::parallel_for("KokkosKernels::Common::SymmetrizeGraphSymbolicHashMap::S1", team_policy(num_rows_to_symmetrize / teamSizeMax + 1 , teamSizeMax, vector_size), FSCH); MyExecSpace::fence(); @@ -1391,7 +1391,7 @@ void copy_view( from_vector from, to_vector to){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0,num_elements), CopyView(from, to)); + Kokkos::parallel_for( "KokkosKernels::Common::CopyView", my_exec_space(0,num_elements), CopyView(from, to)); } @@ -1474,7 +1474,7 @@ struct ReduceSumFunctor{ template void view_reduce_sum(size_t num_elements, view_type view_to_reduce, typename view_type::non_const_value_type &sum_reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0,num_elements), ReduceSumFunctor(view_to_reduce), sum_reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ViewReduceSum", my_exec_space(0,num_elements), ReduceSumFunctor(view_to_reduce), sum_reduction); } @@ -1525,7 +1525,7 @@ void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_ends, size_type &max_row_size){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0,num_rows), + Kokkos::parallel_reduce( "KokkosKernels::Common::ViewReduceMaxRowSize", my_exec_space(0,num_rows), ReduceRowSizeFunctor(rowmap_view_begins, 
rowmap_view_ends), max_row_size); } @@ -1567,7 +1567,7 @@ struct ReduceMaxRowFunctor{ template void view_reduce_maxsizerow(size_t num_rows, view_type rowmap_view, typename view_type::non_const_value_type &max_reduction){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_reduce( my_exec_space(0,num_rows), ReduceMaxRowFunctor(rowmap_view), max_reduction); + Kokkos::parallel_reduce( "KokkosKernels::Common::ViewReduceMaxSizeRow", my_exec_space(0,num_rows), ReduceMaxRowFunctor(rowmap_view), max_reduction); } @@ -1604,7 +1604,7 @@ template bool isSame(size_t num_elements, view_type1 view1, view_type2 view2){ typedef Kokkos::RangePolicy my_exec_space; int issame = 1; - Kokkos::parallel_reduce( my_exec_space(0,num_elements), IsEqualFunctor(view1, view2), issame); + Kokkos::parallel_reduce( "KokkosKernels::Common::isSame", my_exec_space(0,num_elements), IsEqualFunctor(view1, view2), issame); MyExecSpace::fence(); return issame; } @@ -1772,7 +1772,7 @@ void transpose_matrix( int vector_size = get_suggested_vector__size(num_rows, nnz, get_exec_space_type()); Kokkos::Impl::Timer timer1; - Kokkos::parallel_for( tcp_t(num_rows / team_row_work_size + 1 , Kokkos::AUTO_t(), vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeMatrix::S0", tcp_t(num_rows / team_row_work_size + 1 , Kokkos::AUTO_t(), vector_size), tm); MyExecSpace::fence(); exclusive_parallel_prefix_sum(num_cols+1, t_xadj); @@ -1781,7 +1781,7 @@ void transpose_matrix( MyExecSpace::fence(); timer1.reset(); - Kokkos::parallel_for( tfp_t(num_rows / team_row_work_size + 1 , Kokkos::AUTO_t(), vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeMatrix::S1", tfp_t(num_rows / team_row_work_size + 1 , Kokkos::AUTO_t(), vector_size), tm); MyExecSpace::fence(); } @@ -1850,7 +1850,7 @@ void transpose_graph2( int vector_size = get_suggested_vector__size(num_rows, nnz, get_exec_space_type()); Kokkos::Impl::Timer timer1; - Kokkos::parallel_for( tcp_t(num_rows , 
Kokkos::AUTO_t(), vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeGraph2::S0", tcp_t(num_rows , Kokkos::AUTO_t(), vector_size), tm); MyExecSpace::fence(); exclusive_parallel_prefix_sum(num_cols+1, t_xadj); @@ -1859,7 +1859,7 @@ void transpose_graph2( MyExecSpace::fence(); timer1.reset(); - Kokkos::parallel_for( tfp_t(num_rows , Kokkos::AUTO_t(), vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::TransposeGraph::S1", tfp_t(num_rows , Kokkos::AUTO_t(), vector_size), tm); MyExecSpace::fence(); @@ -1912,7 +1912,7 @@ void init_view_withscalar(typename in_row_view_t::size_type num_elements, in_row int vector_size = 1; Kokkos::Impl::Timer timer1; - Kokkos::parallel_for( tcp_t(num_elements / chunk_size + 1 , team_size, vector_size), tm); + Kokkos::parallel_for( "KokkosKernels::Common::InitViewWithScalar", tcp_t(num_elements / chunk_size + 1 , team_size, vector_size), tm); MyExecSpace::fence(); } diff --git a/src/common/KokkosKernels_VectorUtils.hpp b/src/common/KokkosKernels_VectorUtils.hpp index d79ed1369b..534e179174 100644 --- a/src/common/KokkosKernels_VectorUtils.hpp +++ b/src/common/KokkosKernels_VectorUtils.hpp @@ -120,7 +120,7 @@ inline void kk_a_times_x_plus_b( out_array_t out_arr, in_array_t in_arr, scalar_1 a, scalar_2 b){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, num_elements), + Kokkos::parallel_for( "KokkosKernels::Common::ATimesXPlusB", my_exec_space(0, num_elements), A_times_X_plus_B(out_arr, in_arr, a, b)); } @@ -134,7 +134,7 @@ inline void kk_a_times_x_plus_b( template inline void kk_modular_view(typename in_array_type::value_type num_elements, out_array_type out_arr, in_array_type in_arr, int mod_factor_){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, num_elements), ModularView(out_arr, in_arr, mod_factor_)); + Kokkos::parallel_for( "KokkosKernels::Common::ModularView", my_exec_space(0, num_elements), ModularView(out_arr, in_arr, 
mod_factor_)); } @@ -144,7 +144,7 @@ void kk_copy_vector( size_t num_elements, from_vector from, to_vector to){ typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0,num_elements), CopyVectorFunctor(from, to)); + Kokkos::parallel_for( "KokkosKernels::Common::CopyVector", my_exec_space(0,num_elements), CopyVectorFunctor(from, to)); } } diff --git a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp index 1810520cbe..63ec637d7d 100644 --- a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -360,7 +360,7 @@ class GraphColorD2 nnz_lno_temp_work_view_t current_vertexList = nnz_lno_temp_work_view_t(Kokkos::ViewAllocateWithoutInitializing("vertexList"), this->nv); // init conflictlist sequentially. - Kokkos::parallel_for(my_exec_space(0, this->nv), functorInitList(current_vertexList)); + Kokkos::parallel_for("KokkosGraph::Distance2Color::ColorGraphD2", my_exec_space(0, this->nv), functorInitList(current_vertexList)); // Next iteratons's conflictList nnz_lno_temp_work_view_t next_iteration_recolorList; @@ -513,7 +513,7 @@ class GraphColorD2 chunkSize_ ); - Kokkos::parallel_for(my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); + Kokkos::parallel_for("KokkosGraph::Distance2Color::ColorGraphGreedy", my_exec_space(0, current_vertexListLength_ / chunkSize_ + 1), gc); } // colorGreedy (end) @@ -573,7 +573,7 @@ class GraphColorD2 current_vertexList_, next_iteration_recolorList_, next_iteration_recolorListLength_); - Kokkos::parallel_reduce(my_exec_space(0, current_vertexListLength_), conf, output_numUncolored); + Kokkos::parallel_reduce("KokkosGraph::Distance2Color::FindConflicts", my_exec_space(0, current_vertexListLength_), conf, output_numUncolored); } } else diff --git a/src/graph/impl/KokkosGraph_GraphColor_impl.hpp b/src/graph/impl/KokkosGraph_GraphColor_impl.hpp index f1ef7e38b4..48cebc199f 100644 --- 
a/src/graph/impl/KokkosGraph_GraphColor_impl.hpp +++ b/src/graph/impl/KokkosGraph_GraphColor_impl.hpp @@ -1207,24 +1207,24 @@ class GraphColor_VB:public GraphColor _conflictlist == 0){ if (this->_use_color_set == 0 || this->_use_color_set == 2){ functorFindConflicts_No_Conflist conf( this->nv, xadj_, adj_, vertex_colors_); - Kokkos::parallel_reduce(my_exec_space(0, current_vertexListLength_), conf, numUncolored); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseA", my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { functorFindConflicts_No_Conflist_IMP conf(this->nv, xadj_, adj_,vertex_colors_, vertex_color_set_); - Kokkos::parallel_reduce(my_exec_space(0, current_vertexListLength_), conf, numUncolored); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseB", my_exec_space(0, current_vertexListLength_), conf, numUncolored); } } else if (this->_conflictlist == 2){ //IF PPS if (this->_use_color_set == 0 || this->_use_color_set == 2){ // Check for conflicts. Compute numUncolored == numConflicts. functorFindConflicts_PPS conf(this->nv, xadj_, adj_,vertex_colors_,current_vertexList_,next_iteration_recolorList_); - Kokkos::parallel_reduce(my_exec_space(0, current_vertexListLength_), conf, numUncolored); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseC", my_exec_space(0, current_vertexListLength_), conf, numUncolored); } else { functorFindConflicts_PPS_IMP conf(this->nv, xadj_, adj_,vertex_colors_, vertex_color_set_, current_vertexList_,next_iteration_recolorList_); - Kokkos::parallel_reduce(my_exec_space(0, current_vertexListLength_), conf, numUncolored); + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::FindConflicts::CaseD", my_exec_space(0, current_vertexListLength_), conf, numUncolored); } @@ -2940,8 +2940,7 @@ class GraphColor_EB:public GraphColor nv << " i:" << i << " num_work_edges:" << num_work_edges<< std::endl; //conflict detection mark conflicts as color 0. 
//update their bans - Kokkos::parallel_for( - "KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", + Kokkos::parallel_for("KokkosGraph::GraphColoring::HalfEdgeMarkConflicts", my_exec_space(0,num_work_edges), halfedge_mark_conflicts ( _kok_src, _kok_dst, @@ -2966,8 +2965,7 @@ class GraphColor_EB:public GraphColor 0) - Kokkos::parallel_reduce( - "KokkosGraph::GraphColoring::HalfEdgeConflictsCount", + Kokkos::parallel_reduce("KokkosGraph::GraphColoring::HalfEdgeConflictsCount", my_exec_space(0, num_work_edges), halfedge_conflict_count( _kok_src, _kok_dst, @@ -3007,16 +3005,14 @@ class GraphColor_EB:public GraphColor countEntries(a_rowmap, a_entries, b_rowmap, b_entries, c_rowcounts); - Kokkos::parallel_for(range_type(0, nrows), countEntries); + Kokkos::parallel_for("KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", range_type(0, nrows), countEntries); execution_space::fence(); //get c_rowmap as cumulative sum parallel_prefix_sum prefix(c_rowcounts, c_rowmap); - Kokkos::parallel_scan(range_type(0, nrows + 1), prefix); + Kokkos::parallel_scan("KokkosSparse::SpAdd:Symbolic::InputSorted::PrefixSum", range_type(0, nrows + 1), prefix); execution_space::fence(); } else @@ -452,11 +452,11 @@ namespace Experimental { clno_row_view_t_ c_rowcounts_upperbound("C row counts upper bound", nrows); UnsortedEntriesUpperBound countEntries(a_rowmap, b_rowmap, c_rowcounts_upperbound); - Kokkos::parallel_for(range_type(0, nrows), countEntries); + Kokkos::parallel_for("KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntires", range_type(0, nrows), countEntries); execution_space::fence(); //get (temporary) c_rowmap as cumulative sum parallel_prefix_sum prefix(c_rowcounts_upperbound, c_rowmap_upperbound); - Kokkos::parallel_scan(range_type(0, nrows + 1), prefix); + Kokkos::parallel_scan("KokkosSparse::SpAdd:Symbolic::InputNotSorted::PrefixSum", range_type(0, nrows + 1), prefix); //compute uncompressed entries of C (just indices, no scalars) execution_space::fence(); @@ -472,12 
+472,12 @@ namespace Experimental { UnmergedSumFunctor unmergedSum( a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - Kokkos::parallel_for(range_type(0, nrows), unmergedSum); + Kokkos::parallel_for("KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", range_type(0, nrows), unmergedSum); execution_space::fence(); //sort the unmerged sum SortEntriesFunctor sortEntries(c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - Kokkos::parallel_for(range_type(0, nrows), sortEntries); + Kokkos::parallel_for("KokkosSparse::SpAdd:Symbolic::InputNotSorted::SortEntries", range_type(0, nrows), sortEntries); execution_space::fence(); clno_nnz_view_t_ a_pos("A entry positions", a_entries.extent(0)); clno_nnz_view_t_ b_pos("B entry positions", b_entries.extent(0)); @@ -486,11 +486,11 @@ namespace Experimental { clno_row_view_t_ c_rowcounts("C row counts", nrows); MergeEntriesFunctor mergeEntries(a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowcounts, c_entries_uncompressed, ab_perm, a_pos, b_pos); - Kokkos::parallel_for(range_type(0, nrows), mergeEntries); + Kokkos::parallel_for("KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", range_type(0, nrows), mergeEntries); execution_space::fence(); //compute actual c_rowmap parallel_prefix_sum prefix(c_rowcounts, c_rowmap); - Kokkos::parallel_scan(range_type(0, nrows + 1), prefix); + Kokkos::parallel_scan("KokkosSparse::SpAdd:Symbolic::InputNotSorted::PrefixSumSecond", range_type(0, nrows + 1), prefix); execution_space::fence(); } addHandle->set_a_b_pos(a_pos, b_pos); @@ -722,7 +722,7 @@ namespace Experimental { ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, ascalar_t_, bscalar_t_> sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, c_entries, a_values, b_values, c_values, alpha, beta); - Kokkos::parallel_for(range_type(0, nrows), sortedNumeric); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", range_type(0, nrows), 
sortedNumeric); execution_space::fence(); } else @@ -733,7 +733,7 @@ namespace Experimental { ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, ascalar_t_, bscalar_t_> unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, c_entries, a_values, b_values, c_values, alpha, beta, addHandle->get_a_pos(), addHandle->get_b_pos()); - Kokkos::parallel_for(range_type(0, nrows), unsortedNumeric); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", range_type(0, nrows), unsortedNumeric); execution_space::fence(); } addHandle->set_call_numeric(); From 28f1be7e5991306d81e9a4c77a94c17a356d7df7 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 14:20:32 -0700 Subject: [PATCH 24/47] KokkosKernels - profiling labels --- src/blas/impl/KokkosBlas1_scal_spec.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_scal_spec.hpp b/src/blas/impl/KokkosBlas1_scal_spec.hpp index 936d15dbce..edeaa00107 100644 --- a/src/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/src/blas/impl/KokkosBlas1_scal_spec.hpp @@ -144,7 +144,7 @@ struct Scal: " "XV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -203,7 +203,7 @@ struct Scal { "AV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Scal<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::scal<2D> ETI specialization for < %s , %s , %s 
>\n",typeid(RMV).name(),typeid(AV).name(),typeid(XMV).name()); @@ -250,7 +250,7 @@ struct Scal: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::scal[ETI]":"KokkosBlas::scal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) From 0863e6c6609aaa787232180efa83eeb25646aef4 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 14:24:30 -0700 Subject: [PATCH 25/47] KokkosBlas - profiling label --- src/blas/impl/KokkosBlas1_abs_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_axpby_spec.hpp | 6 +++--- src/blas/impl/KokkosBlas1_dot_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_mult_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_nrm1_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_nrm2_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_nrm2w_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_nrminf_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_reciprocal_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_sum_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas1_update_spec.hpp | 4 ++-- src/blas/impl/KokkosBlas2_gemv_spec.hpp | 2 +- src/blas/impl/KokkosBlas3_gemm_spec.hpp | 2 +- 13 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_spec.hpp b/src/blas/impl/KokkosBlas1_abs_spec.hpp index e973aae492..19318d7811 100644 --- a/src/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/src/blas/impl/KokkosBlas1_abs_spec.hpp @@ -127,7 +127,7 @@ struct Abs "RMV is not rank 1."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Abs<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::abs<> ETI 
specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -163,7 +163,7 @@ struct Abs { "RMV is not rank 2."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Abs<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::abs[ETI]":"KokkosBlas::abs[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::abs<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); diff --git a/src/blas/impl/KokkosBlas1_axpby_spec.hpp b/src/blas/impl/KokkosBlas1_axpby_spec.hpp index f955d92fe3..bbeb13d65c 100644 --- a/src/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/src/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -179,7 +179,7 @@ struct Axpby "X and Y must have the same rank."); static_assert (YMV::Rank == 2, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s >\n",typeid(AV).name(),typeid(XMV).name(),typeid(BV).name(),typeid(YMV).name()); @@ -248,7 +248,7 @@ struct Axpby::axpby: " "X and Y must have rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::axpby[ETI]":"KokkosBlas::axpby[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::axpby<> ETI specialization for < %s , %s , %s , %s 
>\n",typeid(AV).name(),typeid(XV).name(),typeid(BV).name(),typeid(YV).name()); diff --git a/src/blas/impl/KokkosBlas1_dot_spec.hpp b/src/blas/impl/KokkosBlas1_dot_spec.hpp index 1c5322f8c3..4d1db931c0 100644 --- a/src/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/src/blas/impl/KokkosBlas1_dot_spec.hpp @@ -177,7 +177,7 @@ struct Dot "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas::dot<> ETI specialization for < %s , %s >\n",typeid(XV).name(),typeid(YV).name()); @@ -214,7 +214,7 @@ struct Dot: " "RV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::dot[ETI]":"KokkosBlas::dot[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::dot<> ETI specialization for < %s , %s , %s >\n",typeid(RV).name(),typeid(XV).name(),typeid(YV).name()); diff --git a/src/blas/impl/KokkosBlas1_mult_spec.hpp b/src/blas/impl/KokkosBlas1_mult_spec.hpp index 3cbc52397d..224369d0e0 100644 --- a/src/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/src/blas/impl/KokkosBlas1_mult_spec.hpp @@ -167,7 +167,7 @@ struct Mult "X, and Y must have the rank 2."); static_assert (AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "AV must have rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION 
if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -219,7 +219,7 @@ struct Mult static_assert ((int) XV::rank == (int) YV::rank && (int) AV::rank == 1, "KokkosBlas::Impl::Mult::mult: " "X, Y, and Z must have rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::mult[ETI]":"KokkosBlas::mult[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::mult<> ETI specialization for < %s , %s , %s >\n",typeid(YV).name(),typeid(AV).name(),typeid(XV).name()); diff --git a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp index 095ace371e..6dfc5220f2 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -131,7 +131,7 @@ struct Nrm1 "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm1<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm1<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -175,7 +175,7 @@ struct Nrm1 { printf("KokkosBlas1::nrm1<> non-ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); } #endif - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm1[ETI]":"KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); if (numRows < static_cast (INT_MAX) && diff --git a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp 
b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp index 254b9ca047..7e68692b8e 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -131,7 +131,7 @@ struct Nrm2 "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm2<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -167,7 +167,7 @@ struct Nrm2 { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Nrm2<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm2[ETI]":"KokkosBlas::nrm2[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); diff --git a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp index cd52c518b3..aedd422010 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -127,7 +127,7 @@ struct Nrm2w "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Nrm2w<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2w<> ETI 
specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -163,7 +163,7 @@ struct Nrm2w { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Nrm2w<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrm2w[ETI]":"KokkosBlas::nrm2w[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrm2w<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); diff --git a/src/blas/impl/KokkosBlas1_nrminf_spec.hpp b/src/blas/impl/KokkosBlas1_nrminf_spec.hpp index 458666594f..80aca6469c 100644 --- a/src/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -131,7 +131,7 @@ struct NrmInf "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::NrmInf<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -167,7 +167,7 @@ struct NrmInf { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::NrmInf<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::nrminf[ETI]":"KokkosBlas::nrminf[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::nrminf<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); 
diff --git a/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 3eee2fc174..4d6c042336 100644 --- a/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/src/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -127,7 +127,7 @@ struct Reciprocal "RMV is not rank 1."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Reciprocal<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); @@ -163,7 +163,7 @@ struct Reciprocal { "RMV is not rank 2."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Reciprocal<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::reciprocal[ETI]":"KokkosBlas::reciprocal[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::reciprocal<> ETI specialization for < %s , %s >\n",typeid(RMV).name(),typeid(XMV).name()); diff --git a/src/blas/impl/KokkosBlas1_sum_spec.hpp b/src/blas/impl/KokkosBlas1_sum_spec.hpp index 7476fae98f..e2e2882f21 100644 --- a/src/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/src/blas/impl/KokkosBlas1_sum_spec.hpp @@ -131,7 +131,7 @@ struct Sum "RMV is not rank 0."); static_assert (XMV::rank == 1, "KokkosBlas::Impl::Sum<1-D>: " "XMV is not rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); + 
Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -168,7 +168,7 @@ struct Sum { "RV is not rank 1."); static_assert (XMV::rank == 2, "KokkosBlas::Impl::Sum<2-D>: " "XMV is not rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::sum[ETI]":"KokkosBlas::sum[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::sum<> ETI specialization for < %s , %s >\n",typeid(RV).name(),typeid(XMV).name()); diff --git a/src/blas/impl/KokkosBlas1_update_spec.hpp b/src/blas/impl/KokkosBlas1_update_spec.hpp index 872b99aa32..ede3d82488 100644 --- a/src/blas/impl/KokkosBlas1_update_spec.hpp +++ b/src/blas/impl/KokkosBlas1_update_spec.hpp @@ -162,7 +162,7 @@ struct Update "X, Y, and Z must have the same rank."); static_assert (ZMV::rank == 2, "KokkosBlas::Impl::Update::update: " "XMV, YMV, and ZMV must have rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -257,7 +257,7 @@ struct Update "X, Y, and Z must have the same rank."); static_assert (ZV::rank == 1, "KokkosBlas::Impl::Update::update: " "XV, YV, and ZV must have rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::update[ETI]":"KokkosBlas::update[noETI]"); #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION if(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) printf("KokkosBlas1::update<> 
ETI specialization for < %s , %s , %s >\n",typeid(XV).name(),typeid(YV).name(),typeid(ZV).name()); diff --git a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp index b7fe8e3456..ff3e5e584e 100644 --- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -127,7 +127,7 @@ struct GEMV { "XViewType must have rank 1."); static_assert (static_cast (YViewType::rank) == 1, "YViewType must have rank 1."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::gemv[ETI]":"KokkosBlas::gemv[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::gemv[ETI]":"KokkosBlas::gemv[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index b40df1e5bb..b1f633127d 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -128,7 +128,7 @@ struct GEMM { static_assert (static_cast (CViewType::rank) == 2, "CViewType must have rank 2."); - Kokkos::Profiling::pushRegion(eti_spec_avail?"KokkosBlas::gemm[ETI]":"KokkosBlas::gemm[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY?"KokkosBlas::gemm[ETI]":"KokkosBlas::gemm[noETI]"); // Figure out Scalar Types typedef typename AViewType::non_const_value_type ScalarA; typedef typename BViewType::non_const_value_type ScalarB; From 72c9bbbe17e7bb0b76aeadb75dccabb326bd5145 Mon Sep 17 00:00:00 2001 From: crtrott Date: Mon, 5 Nov 2018 14:28:12 -0700 Subject: [PATCH 26/47] Update master_history.txt --- src/blas/impl/KokkosBlas1_dot_mv_impl.hpp | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 63eb5bb880..b0ddf40c23 100644 --- a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ 
b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -415,7 +415,7 @@ struct MV_V_Dot_Invoke_Impl typedef MV_V_Dot_Functor op_type; constexpr bool reverseOrder = false; op_type op (r, X, Y, reverseOrder); - Kokkos::parallel_reduce (range_type (0, numRows), op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::2_1", range_type (0, numRows), op, r); } }; @@ -439,7 +439,7 @@ struct MV_V_Dot_Invoke_Impl typedef MV_V_Dot_Functor op_type; constexpr bool reverseOrder = true; op_type op (r, Y, X, reverseOrder); - Kokkos::parallel_reduce (range_type (0, numRows), op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::1_2", range_type (0, numRows), op, r); } }; @@ -488,7 +488,7 @@ MV_Dot_Invoke (const RV& r, const XMV& X, const YMV& Y) auto r_cur = Kokkos::subview (r, std::make_pair (j, j+8)); MV_Dot_Right_FunctorUnroll op (r_cur, X_cur, Y_cur); - Kokkos::parallel_reduce (policy, op, r_cur); + Kokkos::parallel_reduce ("KokkosBlas::Dot::2_2::8", policy, op, r_cur); } for ( ; j + 4 <= numCols; j += 4) { auto X_cur = Kokkos::subview (X, Kokkos::ALL (), std::make_pair (j, j+4)); @@ -496,7 +496,7 @@ MV_Dot_Invoke (const RV& r, const XMV& X, const YMV& Y) auto r_cur = Kokkos::subview (r, std::make_pair (j, j+4)); MV_Dot_Right_FunctorUnroll op (r_cur, X_cur, Y_cur); - Kokkos::parallel_reduce (policy, op, r_cur); + Kokkos::parallel_reduce ("KokkosBlas::Dot::2_2::4", policy, op, r_cur); } for ( ; j < numCols; ++j) { // RV needs to turn 0-D, and XMV and YMV need to turn 1-D. 
@@ -508,90 +508,90 @@ MV_Dot_Invoke (const RV& r, const XMV& X, const YMV& Y) typedef decltype (y_cur) YMV1D; DotFunctor op(x_cur, y_cur); - Kokkos::parallel_reduce (policy, op, r_cur); + Kokkos::parallel_reduce ("KokkosBlas::Dot::2_2::1", policy, op, r_cur); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_DOT > 2 if (numCols > 16) { MV_Dot_Right_FunctorVector op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::16Above", policy, op, r); } else { switch (numCols) { case 16: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::16", policy, op, r); break; } case 15: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::15", policy, op, r); break; } case 14: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::14", policy, op, r); break; } case 13: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::13", policy, op, r); break; } case 12: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::12", policy, op, r); break; } case 11: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::11", policy, op, r); break; } case 10: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::10", policy, op, r); break; } case 9: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::9", policy, op, r); break; } case 8: { MV_Dot_Right_FunctorUnroll op (r, X, 
Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::8", policy, op, r); break; } case 7: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::7", policy, op, r); break; } case 6: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::6", policy, op, r); break; } case 5: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::5", policy, op, r); break; } case 4: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::4", policy, op, r); break; } case 3: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::3", policy, op, r); break; } case 2: { MV_Dot_Right_FunctorUnroll op (r, X, Y); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::2", policy, op, r); break; } case 1: { @@ -605,7 +605,7 @@ MV_Dot_Invoke (const RV& r, const XMV& X, const YMV& Y) typedef V_Dot_Functor op_type; op_type op (r_0, X_0, Y_0); - Kokkos::parallel_reduce (policy, op, r_0); + Kokkos::parallel_reduce ("KokkosBlas::Dot::NumCols::1", policy, op, r_0); break; } } // switch From 24bad8ab5a37a4d26196a20b38fb4ed6a38ecef3 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 15:18:00 -0700 Subject: [PATCH 27/47] KokkosKernels - profiling labels ... it is a lot... Christian, you said it will take a couple of hours .... It took more than that . 
--- src/blas/impl/KokkosBlas1_abs_impl.hpp | 8 +-- src/blas/impl/KokkosBlas1_axpby_impl.hpp | 32 +++++----- src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 64 +++++++++---------- src/blas/impl/KokkosBlas1_mult_impl.hpp | 16 ++--- src/blas/impl/KokkosBlas1_nrm1_impl.hpp | 4 +- src/blas/impl/KokkosBlas1_nrm2_impl.hpp | 4 +- src/blas/impl/KokkosBlas1_nrm2w_impl.hpp | 4 +- src/blas/impl/KokkosBlas1_nrminf_impl.hpp | 4 +- src/blas/impl/KokkosBlas1_reciprocal_impl.hpp | 8 +-- src/blas/impl/KokkosBlas1_scal_impl.hpp | 8 +-- src/blas/impl/KokkosBlas1_scal_mv_impl.hpp | 16 ++--- src/blas/impl/KokkosBlas1_sum_impl.hpp | 4 +- 12 files changed, 86 insertions(+), 86 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_impl.hpp b/src/blas/impl/KokkosBlas1_abs_impl.hpp index ef26674e0a..33c713aeb5 100644 --- a/src/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/src/blas/impl/KokkosBlas1_abs_impl.hpp @@ -199,11 +199,11 @@ MV_Abs_Generic (const RMV& R, const XMV& X) if (R == X) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op (R); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Abs::S0", policy, op); } else { MV_Abs_Functor op (R, X); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Abs::S1", policy, op); } } @@ -227,11 +227,11 @@ V_Abs_Generic (const RV& R, const XV& X) if (R == X) { // if R and X are the same (alias one another) V_AbsSelf_Functor op (R); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Abs::S2", policy, op); } else { V_Abs_Functor op (R, X); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Abs::S3", policy, op); } } diff --git a/src/blas/impl/KokkosBlas1_axpby_impl.hpp b/src/blas/impl/KokkosBlas1_axpby_impl.hpp index 66fc4f5782..bc402caf59 100644 --- a/src/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/src/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -352,26 +352,26 @@ Axpby_Generic (const AV& av, const XV& x, if (a == 0 && b == 0) { Axpby_Functor op (x, y, 
av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S0", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 0 && b == -1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S1", policy, op); return; } if (a == 0 && b == 1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S2", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 0 && b == 2) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S3", policy, op); return; } @@ -379,43 +379,43 @@ Axpby_Generic (const AV& av, const XV& x, // a == -1 if (a == -1 && b == 0) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S4", policy, op); return; } if (a == -1 && b == -1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S5", policy, op); return; } if (a == -1 && b == 1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S6", policy, op); return; } if (a == -1 && b == 2) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S7", policy, op); return; } // a == 1 if (a == 1 && b == 0) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S8", policy, op); return; } if (a == 1 && b == -1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S9", policy, op); return; } if (a == 1 && b == 1) { Axpby_Functor op 
(x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S10", policy, op); return; } if (a == 1 && b == 2) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S11", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 @@ -423,26 +423,26 @@ Axpby_Generic (const AV& av, const XV& x, // a == 2 if (a == 2 && b == 0) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S12", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 2 && b == -1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S13", policy, op); return; } if (a == 2 && b == 1) { Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S14", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // a and b arbitrary (not -1, 0, or 1) Axpby_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::S15", policy, op); } } diff --git a/src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index f33898f49a..8a3c8f9ef8 100644 --- a/src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/src/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -996,26 +996,26 @@ Axpby_MV_Unrolled (const AV& av, const XMV& x, if (a == 0 && b == 0) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S0", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 0 && b == -1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for 
("KokkosBlas::Axpby::MV::S1", policy, op); return; } if (a == 0 && b == 1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S2", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY if (a == 0 && b == 2) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S3", policy, op); return; } @@ -1023,43 +1023,43 @@ Axpby_MV_Unrolled (const AV& av, const XMV& x, // a == -1 if (a == -1 && b == 0) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S4", policy, op); return; } if (a == -1 && b == -1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S5", policy, op); return; } if (a == -1 && b == 1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S6", policy, op); return; } if (a == -1 && b == 2) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S7", policy, op); return; } // a == 1 if (a == 1 && b == 0) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S8", policy, op); return; } if (a == 1 && b == -1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S9", policy, op); return; } if (a == 1 && b == 1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S10", policy, op); return; } if (a == 1 && b == 2) { 
Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S11", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 @@ -1067,26 +1067,26 @@ Axpby_MV_Unrolled (const AV& av, const XMV& x, // a == 2 if (a == 2 && b == 0) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S12", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 2 && b == -1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S13", policy, op); return; } if (a == 2 && b == 1) { Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S14", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // a and b arbitrary (not -1, 0, or 1) Axpby_MV_Unroll_Functor op (x, y, av, bv, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S15", policy, op); } // Invoke the "generic" (not unrolled) multivector functor that @@ -1134,26 +1134,26 @@ Axpby_MV_Generic (const AV& av, const XMV& x, if (a == 0 && b == 0) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S16", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 0 && b == -1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S17", policy, op); return; } if (a == 0 && b == 1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S18", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 0 && b == 2) { Axpby_MV_Functor op (x, y, av, 
bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S19", policy, op); return; } @@ -1161,43 +1161,43 @@ Axpby_MV_Generic (const AV& av, const XMV& x, // a == -1 if (a == -1 && b == 0) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S20", policy, op); return; } if (a == -1 && b == -1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S21", policy, op); return; } if (a == -1 && b == 1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S22", policy, op); return; } if (a == -1 && b == 2) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S23", policy, op); return; } // a == 1 if (a == 1 && b == 0) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S24", policy, op); return; } if (a == 1 && b == -1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S25", policy, op); return; } if (a == 1 && b == 1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S26", policy, op); return; } if (a == 1 && b == 2) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S27", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 @@ -1205,26 +1205,26 @@ Axpby_MV_Generic (const AV& av, const XMV& x, // a == 2 if (a == 2 && b == 0) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S28", policy, op); return; } #if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (a == 2 && b == -1) { Axpby_MV_Functor 
op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S29", policy, op); return; } if (a == 2 && b == 1) { Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S30", policy, op); return; } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 // a and b arbitrary (not -1, 0, or 1) Axpby_MV_Functor op (x, y, av, bv); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Axpby::MV::S31", policy, op); } // Compute any of the following, in a way optimized for X and Y diff --git a/src/blas/impl/KokkosBlas1_mult_impl.hpp b/src/blas/impl/KokkosBlas1_mult_impl.hpp index 8ade9b71cd..fcb53ae5e7 100644 --- a/src/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/src/blas/impl/KokkosBlas1_mult_impl.hpp @@ -215,24 +215,24 @@ V_Mult_Generic (typename CV::const_value_type& c, if (ab == ATA::zero ()) { typedef V_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S0", policy, op); } else { typedef V_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S1", policy, op); } } else { // c != 0 if (ab == ATA::zero ()) { typedef V_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S2", policy, op); } else { typedef V_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S3", policy, op); } } } @@ -277,24 +277,24 @@ MV_Mult_Generic (typename CMV::const_value_type& c, if (ab == ATA::zero ()) { typedef MV_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S4", policy, op); } else { typedef MV_MultFunctor functor_type; functor_type op 
(c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S5", policy, op); } } else { // c != 0 if (ab == ATA::zero ()) { typedef MV_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S6", policy, op); } else { typedef MV_MultFunctor functor_type; functor_type op (c, C, ab, A, B); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Mult::S7", policy, op); } } } diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 064cfd1eb1..5ca1fe694a 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -230,7 +230,7 @@ V_Nrm1_Invoke (const RV& r, const XV& X) typedef V_Nrm1_Functor functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas1::Nrm1::S0", policy, op, r); } @@ -256,7 +256,7 @@ MV_Nrm1_Invoke (const RV& r, const XMV& X) else { typedef MV_Nrm1_Right_FunctorVector functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas1::Nrm1::S1", policy, op, r); } } diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index 60a3f2cd9b..42d3b73518 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -256,7 +256,7 @@ V_Nrm2_Invoke (const RV& r, const XV& X, const bool& take_sqrt) typedef V_Nrm2_Functor functor_type; functor_type op (X, take_sqrt); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Nrm2::S0", policy, op, r); } @@ -282,7 +282,7 @@ MV_Nrm2_Invoke (const RV& r, const XMV& X, const bool& take_sqrt) else { typedef MV_Nrm2_Right_FunctorVector functor_type; functor_type op (X, take_sqrt); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Nrm2::S1", policy, op, r); 
} } diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index d1c038c146..1a125b81df 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -257,7 +257,7 @@ V_Nrm2w_Invoke (const RV& r, const XV& X, const XV& W, const bool& take_sqrt) typedef V_Nrm2w_Functor functor_type; functor_type op (X, W, take_sqrt); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Nrm2w::S0", policy, op, r); } @@ -284,7 +284,7 @@ MV_Nrm2w_Invoke (const RV& r, const XMV& X, const XMV& W, const bool& take_sqrt) else { typedef MV_Nrm2w_Right_FunctorVector functor_type; functor_type op (X, W, take_sqrt); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Nrm2w::S1", policy, op, r); } } diff --git a/src/blas/impl/KokkosBlas1_nrminf_impl.hpp b/src/blas/impl/KokkosBlas1_nrminf_impl.hpp index 535aa16e74..c015d854e1 100644 --- a/src/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -224,7 +224,7 @@ V_NrmInf_Invoke (const RV& r, const XV& X) typedef V_NrmInf_Functor functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, Kokkos::Max(r())); + Kokkos::parallel_reduce ("KokkosBlas::NrmInf::S0", policy, op, Kokkos::Max(r())); } @@ -262,7 +262,7 @@ MV_NrmInf_Invoke (const RV& r, const XMV& X) else { typedef MV_NrmInf_Right_FunctorVector functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::NrmInf::S1", policy, op, r); } } diff --git a/src/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/src/blas/impl/KokkosBlas1_reciprocal_impl.hpp index e3f4193ef8..21f7c8967b 100644 --- a/src/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/src/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -199,11 +199,11 @@ MV_Reciprocal_Generic (const RMV& R, const XMV& X) if (R == X) { // if R and X are the same (alias one another) MV_ReciprocalSelf_Functor op (R); - 
Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Reciprocal::S0", policy, op); } else { MV_Reciprocal_Functor op (R, X); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Reciprocal::S1", policy, op); } } @@ -227,11 +227,11 @@ V_Reciprocal_Generic (const RV& R, const XV& X) if (R == X) { // if R and X are the same (alias one another) V_ReciprocalSelf_Functor op (R); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Reciprocal::S2", policy, op); } else { V_Reciprocal_Functor op (R, X); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Reciprocal::S3", policy, op); } } diff --git a/src/blas/impl/KokkosBlas1_scal_impl.hpp b/src/blas/impl/KokkosBlas1_scal_impl.hpp index d19ab7a2db..b7a24d53bb 100644 --- a/src/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/src/blas/impl/KokkosBlas1_scal_impl.hpp @@ -188,23 +188,23 @@ V_Scal_Generic (const RV& r, const AV& av, const XV& x, if (a == 0) { V_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::S0", policy, op); return; } if (a == -1) { V_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::S1", policy, op); return; } if (a == 1) { V_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::S2", policy, op); return; } // a arbitrary (not -1, 0, or 1) V_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::S3", policy, op); } diff --git a/src/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/src/blas/impl/KokkosBlas1_scal_mv_impl.hpp index f672b8fe58..da4bc97ea5 100644 --- a/src/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/src/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -371,21 +371,21 @@ MV_Scal_Unrolled (const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor 
op (r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy (0, numRows); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S0", policy, op); return; } if (a == -1) { MV_Scal_Unroll_Functor op (r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy (0, numRows); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S1", policy, op); return; } if (a == 1) { MV_Scal_Unroll_Functor op (r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy (0, numRows); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S2", policy, op); return; } @@ -393,7 +393,7 @@ MV_Scal_Unrolled (const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op (r, x, av, startingColumn); const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy (0, numRows); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S3", policy, op); } // Invoke the "generic" (not unrolled) multivector functor that @@ -424,23 +424,23 @@ MV_Scal_Generic (const RVector& r, if (a == 0) { MV_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S4", policy, op); return; } if (a == -1) { MV_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S5", policy, op); return; } if (a == 1) { MV_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S6", policy, op); return; } // a arbitrary (not -1, 0, or 1) MV_Scal_Functor op (r, x, av, startingColumn); - Kokkos::parallel_for (policy, op); + Kokkos::parallel_for ("KokkosBlas::Scal::MV::S7", policy, op); } diff --git a/src/blas/impl/KokkosBlas1_sum_impl.hpp b/src/blas/impl/KokkosBlas1_sum_impl.hpp index 
d943f58e76..3ed261601a 100644 --- a/src/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/src/blas/impl/KokkosBlas1_sum_impl.hpp @@ -212,7 +212,7 @@ V_Sum_Invoke (const RV& r, const XV& X) typedef V_Sum_Functor functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Sum::S0", policy, op, r); } @@ -238,7 +238,7 @@ MV_Sum_Invoke (const RV& r, const XMV& X) else { typedef MV_Sum_Right_FunctorVector functor_type; functor_type op (X); - Kokkos::parallel_reduce (policy, op, r); + Kokkos::parallel_reduce ("KokkosBlas::Sum::S1", policy, op, r); } } From cc52e6c9ed2720a7c93256a622cdc2f20343dcb7 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 15:27:32 -0700 Subject: [PATCH 28/47] KokkosKernels - still .....profiling labels... --- test_common/KokkosBatched_Test_BlockCrs.hpp | 24 +++++++++------------ test_common/Test_Common_ArithTraits.hpp | 2 +- test_common/Test_Common_set_bit_count.hpp | 8 +++---- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/test_common/KokkosBatched_Test_BlockCrs.hpp b/test_common/KokkosBatched_Test_BlockCrs.hpp index e12f66a355..3bc5e023f8 100644 --- a/test_common/KokkosBatched_Test_BlockCrs.hpp +++ b/test_common/KokkosBatched_Test_BlockCrs.hpp @@ -116,8 +116,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { const int ijbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &idx) { const int ij = ijbeg + idx; if (ij < _ntridiag) { @@ -211,8 +210,7 @@ namespace KokkosBatched { ScratchViewType sA(member.team_scratch(_shmemlvl), VectorLength, _blocksize, _blocksize); const int ijbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 
VectorLength), [&](const int &idx) { const int ij = ijbeg + idx; if (ij < _ntridiag) { @@ -294,7 +292,7 @@ namespace KokkosBatched { case 0: { std::cout << "KokkosBatched::RangeTag::" << Gemm_AlgoTagType::name() << "\n"; const Kokkos::RangePolicy policy(0, _ntridiag); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::FactorizeBlockTridiagMatrices::Op0", policy, *this); break; } #if defined(KOKKOS_ENABLE_CUDA) && defined(__KOKKOSBATCHED_TEST_ENABLE_CUDA__) @@ -322,7 +320,7 @@ namespace KokkosBatched { } const policy_type policy(_ntridiag, team_size, VectorLength); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::FactorizeBlockTridiagMatrices::Op1", policy, *this); break; } case 2: { @@ -354,7 +352,7 @@ namespace KokkosBatched { } policy_type policy = policy_type(_ntridiag, team_size, VectorLength).set_scratch_size(_shmemlvl, Kokkos::PerTeam(per_team_scratch)); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::FactorizeBlockTridiagMatrices::Op2", policy, *this); } break; } @@ -604,8 +602,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { const int ijbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &idx) { const int ij = ijbeg + idx; if (ij < _ntridiag) { @@ -704,8 +701,7 @@ namespace KokkosBatched { ScratchViewType s(member.team_scratch(_shmemlvl), VectorLength, _m, _blocksize); const int ijbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &idx) { const int ij = ijbeg + idx; if (ij < _ntridiag) { @@ -825,7 +821,7 @@ namespace KokkosBatched { switch (op) { case 0: { 
const Kokkos::RangePolicy policy(0, _ntridiag); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::SolveBlockTridiagMatrices::Op0", policy, *this); break; } #if defined(KOKKOS_ENABLE_CUDA) && defined(__KOKKOSBATCHED_TEST_ENABLE_CUDA__) @@ -850,7 +846,7 @@ namespace KokkosBatched { } const policy_type policy(_ntridiag, team_size, VectorLength); - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::SolveBlockTridiagMatrices::Op1", policy, *this); break; } case 2: { @@ -879,7 +875,7 @@ namespace KokkosBatched { } policy_type policy = policy_type(_ntridiag, team_size, VectorLength).set_scratch_size(_shmemlvl, Kokkos::PerTeam(per_team_scratch));; - Kokkos::parallel_for(policy, *this); + Kokkos::parallel_for("KokkosBatched::Test::BlockCrs::SolveBlockTridiagMatrices::Op2", policy, *this); } break; } diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index 5f68650187..9564dbc6b9 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -1447,7 +1447,7 @@ int testArithTraitsOnDevice (std::ostream& out, const int verbose) using std::endl; typedef ArithTraitsTester functor_type; int success = 1; // output argument of parallel_reduce - Kokkos::parallel_reduce (1, functor_type (), success); + Kokkos::parallel_reduce ("KokkosKernels::Common::Test::ArithTraitsOnDevice", 1, functor_type (), success); if (success) { if (verbose) out << Kokkos::Details::ArithTraits::name () << " passed" << endl; diff --git a/test_common/Test_Common_set_bit_count.hpp b/test_common/Test_Common_set_bit_count.hpp index 63e6008d3b..2f233eff79 100644 --- a/test_common/Test_Common_set_bit_count.hpp +++ b/test_common/Test_Common_set_bit_count.hpp @@ -104,7 +104,7 @@ view_type get_array_bit_count(view_type view){ typename view_type::non_const_type out_view ("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - 
Kokkos::parallel_for( my_exec_space(0, view.extent(0)),ppctest (view, out_view)); + Kokkos::parallel_for( "KokkosKernels::Common::Test::GetArrayBitCount", my_exec_space(0, view.extent(0)),ppctest (view, out_view)); Kokkos::fence(); return out_view; } @@ -116,7 +116,7 @@ view_type check_array_bit_count(view_type view){ typename view_type::non_const_type out_view ("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, view.extent(0)), ppccheck (view, out_view)); + Kokkos::parallel_for( "KokkosKernels::Common::Test::CheckArrayBitCount", my_exec_space(0, view.extent(0)), ppccheck (view, out_view)); Kokkos::fence(); return out_view; } @@ -165,7 +165,7 @@ view_type get_ffs(view_type view){ typename view_type::non_const_type out_view ("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, view.extent(0)), ffstest (view, out_view)); + Kokkos::parallel_for( "KokkosKernels::Common::Test::GetFFS", my_exec_space(0, view.extent(0)), ffstest (view, out_view)); Kokkos::fence(); return out_view; } @@ -177,7 +177,7 @@ view_type check_ffs(view_type view){ typename view_type::non_const_type out_view ("out", view.extent(0)); typedef Kokkos::RangePolicy my_exec_space; - Kokkos::parallel_for( my_exec_space(0, view.extent(0)), ffscheck (view, out_view)); + Kokkos::parallel_for( "KokkosKernels::Common::Test::CheckFFS", my_exec_space(0, view.extent(0)), ffscheck (view, out_view)); Kokkos::fence(); return out_view; } From bb59c4f74f62dab128feff81bb52c9d10a7026b8 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 11 Dec 2018 15:47:13 -0700 Subject: [PATCH 29/47] KokkosKernels - perf test labeling --- .../batched/KokkosBatched_Test_Gemm_Cuda.cpp | 25 ++++++++----------- .../batched/KokkosBatched_Test_Gemm_Host.hpp | 16 ++++++------ .../batched/KokkosBatched_Test_Gemv_Host.hpp | 12 ++++----- .../batched/KokkosBatched_Test_LU_Cuda.cpp | 8 +++--- 
.../batched/KokkosBatched_Test_LU_Host.hpp | 12 ++++----- .../batched/KokkosBatched_Test_Trsm_Cuda.cpp | 8 +++--- .../batched/KokkosBatched_Test_Trsm_Host.hpp | 7 +++--- perf_test/graph/KokkosGraph_run_triangle.hpp | 2 +- perf_test/sparse/spmv/Kokkos_SPMV.hpp | 2 +- .../sparse/spmv/Kokkos_SPMV_Inspector.hpp | 2 +- 10 files changed, 44 insertions(+), 50 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_Gemm_Cuda.cpp b/perf_test/batched/KokkosBatched_Test_Gemm_Cuda.cpp index 475ede4efe..3477120a2e 100644 --- a/perf_test/batched/KokkosBatched_Test_Gemm_Cuda.cpp +++ b/perf_test/batched/KokkosBatched_Test_Gemm_Cuda.cpp @@ -74,8 +74,7 @@ namespace KokkosBatched { void operator()(const TeamTagV1 &, const MemberType &member) const { const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { const int kk = kbeg + k; if (kk < int(_c.extent(0))) { @@ -93,8 +92,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, const MemberType &member) const { const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { const int kk = kbeg + k; if (kk < int(_c.extent(0))) { @@ -116,8 +114,7 @@ namespace KokkosBatched { ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { const int kk = kbeg + k; if (kk < int(_c.extent(0))) { @@ -142,14 +139,12 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION void operator()(const 
TeamTagHandmade &, const MemberType &member) const { const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { const int kk = kbeg + k; if (kk < int(_c.extent(0))) { const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for - (Kokkos::TeamThreadRange(member,0,m*n), + Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n), [&](const int &ij) { const int i = ij%m, j = ij/m; typename ViewType::non_const_value_type cval = 0; @@ -315,7 +310,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for("GEMM: RangePolicy version", policy, functor_type(a,b,c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", policy, functor_type(a,b,c)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -382,7 +377,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for("GEMM: TeamPolicy version 1", policy,functor_type(a,b,c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", policy,functor_type(a,b,c)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -455,7 +450,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for("GEMM: TeamPolicy version 2", policy, functor_type(a,b,c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", policy, functor_type(a,b,c)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -532,7 +527,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for("GEMM: TeamPolicy version 3", policy, functor_type(a,b,c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, functor_type(a,b,c)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -604,7 +599,7 @@ namespace KokkosBatched { 
DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for("GEMM: TeamPolicy handmade", policy, functor_type(a,b,c)); + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, functor_type(a,b,c)); DeviceSpaceType::fence(); const double t = timer.seconds(); diff --git a/perf_test/batched/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/KokkosBatched_Test_Gemm_Host.hpp index 8634a25615..8e8a36d9fc 100644 --- a/perf_test/batched/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/KokkosBatched_Test_Gemm_Host.hpp @@ -87,8 +87,8 @@ namespace KokkosBatched { amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); - Kokkos::parallel_for - (Kokkos::RangePolicy(0, N*VectorLength), + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack", + Kokkos::RangePolicy(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); - Kokkos::parallel_for - (Kokkos::RangePolicy(0, N*VectorLength), + Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", + Kokkos::RangePolicy(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i policy(0, N*VectorLength); - Kokkos::parallel_for - (policy, + Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", + policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto pp = Kokkos::subview(p, k, Kokkos::ALL()); @@ -295,8 +295,8 @@ namespace KokkosBatched { timer.reset(); Kokkos::RangePolicy policy(0, N); - Kokkos::parallel_for - (policy, + Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", + policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); diff --git a/perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp 
b/perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp index 2b8bf13fc2..d285ad8a34 100644 --- a/perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp @@ -477,7 +477,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for(policy, functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, functor_type(a, b)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -540,7 +540,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for(policy, functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, functor_type(a, b)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -612,7 +612,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for(policy, functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, functor_type(a, b)); DeviceSpaceType::fence(); const double t = timer.seconds(); @@ -689,7 +689,7 @@ namespace KokkosBatched { DeviceSpaceType::fence(); timer.reset(); - Kokkos::parallel_for(policy, functor_type(a, b)); + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, functor_type(a, b)); DeviceSpaceType::fence(); const double t = timer.seconds(); diff --git a/perf_test/batched/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/KokkosBatched_Test_Trsm_Host.hpp index 601eb70733..c345ae1ed8 100644 --- a/perf_test/batched/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/KokkosBatched_Test_Trsm_Host.hpp @@ -153,8 +153,7 @@ namespace KokkosBatched { timer.reset(); Kokkos::RangePolicy policy(0, N*VectorLength); - Kokkos::parallel_for - (policy, + Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), 
Kokkos::ALL()); @@ -600,8 +599,8 @@ namespace KokkosBatched { timer.reset(); Kokkos::RangePolicy policy(0, N); - Kokkos::parallel_for - (policy, + Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", + policy, KOKKOS_LAMBDA(const int k) { auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp index 046f81ff54..49339a4663 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -151,7 +151,7 @@ struct Flush { void run() { double sum = 0; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0,BufSize/sizeof(double)), *this, sum); + Kokkos::parallel_reduce("KokkosGraph::PerfTest::Flush", Kokkos::RangePolicy(0,BufSize/sizeof(double)), *this, sum); SpaceType::fence(); std::cout << "Flush sum:" << sum << std::endl; FILE *fp = fopen("/dev/null", "w"); diff --git a/perf_test/sparse/spmv/Kokkos_SPMV.hpp b/perf_test/sparse/spmv/Kokkos_SPMV.hpp index c790762ae8..aea860b1c2 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV.hpp @@ -186,7 +186,7 @@ void kk_matvec(AType A, XType x, YType y, int rows_per_thread, int team_size, in else policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - Kokkos::parallel_for(policy,func); + Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV", policy,func); } diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index 2a8c0b047a..f3131825a7 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -164,7 +164,7 @@ void kk_inspector_matvec(AType A, XType x, YType y, int rows_per_thread, int tea else policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - Kokkos::parallel_for(policy,func); + 
Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV_Inspector", policy,func); } From 545737c4bfae4c2e41c5944d2ce32273dc581a9e Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 11 Dec 2018 16:08:56 -0700 Subject: [PATCH 30/47] Fix equivalent view check --- src/blas/impl/KokkosBlas1_abs_impl.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_abs_impl.hpp b/src/blas/impl/KokkosBlas1_abs_impl.hpp index 2da5e976e7..f08e7e2154 100644 --- a/src/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/src/blas/impl/KokkosBlas1_abs_impl.hpp @@ -197,9 +197,7 @@ MV_Abs_Generic (const RMV& R, const XMV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if(std::is_same::value && - std::is_same::value && - ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) + if((void*) (R.data()) == (void*) (X.data())) { // if R and X are the same (alias one another) MV_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); } @@ -227,9 +225,7 @@ V_Abs_Generic (const RV& R, const XV& X) const SizeType numRows = X.extent(0); Kokkos::RangePolicy policy (0, numRows); - if(std::is_same::value && - std::is_same::value && - ((void*) (R.data()) == (void*) (X.data()))) { // if R and X are the same (alias one another) + if((void*) (R.data()) == (void*) (X.data())) { // if R and X are the same (alias one another) V_AbsSelf_Functor op (R); Kokkos::parallel_for (policy, op); } From fd3898ff22e832a8fdc49ce36503b655ef02f218 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 18 Dec 2018 11:54:29 -0700 Subject: [PATCH 31/47] test_all_sandia: update ibm module list Load gcc/7.2.0 when loading xl to resolve issues trying to build/test with c++14 on white. Additional cxxflags must be passed to use gcc/7.2.0 headers for proper c++14 support. 
This matches changes made in kokkos/kokkos#1945 --- scripts/test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index fc8ebb8a73..103abc37bc 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -242,7 +242,7 @@ elif [ "$MACHINE" = "white" ]; then export SLURM_TASKS_PER_NODE=32 BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" + IBM_MODULE_LIST="/xl/,gcc/7.2.0" CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.0" module load netlib/3.8.0/gcc From 291684400371151e662ce35efe4829a5964cfb55 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 18 Dec 2018 12:49:00 -0700 Subject: [PATCH 32/47] test_all_sandia: add cuda/10 support for testing on white --- scripts/test_all_sandia | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 103abc37bc..26eedfefdd 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -244,6 +244,7 @@ elif [ "$MACHINE" = "white" ]; then BASE_MODULE_LIST="/" IBM_MODULE_LIST="/xl/,gcc/7.2.0" CUDA_MODULE_LIST="/,gcc/7.2.0,ibm/xl/16.1.0" + CUDA10_MODULE_LIST="/,gcc/7.4.0,ibm/xl/16.1.0" module load netlib/3.8.0/gcc export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a" @@ -265,6 +266,7 @@ elif [ "$MACHINE" = "white" ]; then "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi From d38c0d7ef329157a4746363fbbe7e429d09f2972 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 19 Dec 2018 17:12:18 -0700 Subject: [PATCH 33/47] integration scripts: add script for cuda_serial + complex testing --- ..._jenkins_script_cuda_serial_complex-kepler | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) 
create mode 100755 scripts/trilinos-integration/white_run_jenkins_script_cuda_serial_complex-kepler diff --git a/scripts/trilinos-integration/white_run_jenkins_script_cuda_serial_complex-kepler b/scripts/trilinos-integration/white_run_jenkins_script_cuda_serial_complex-kepler new file mode 100755 index 0000000000..603fafdeec --- /dev/null +++ b/scripts/trilinos-integration/white_run_jenkins_script_cuda_serial_complex-kepler @@ -0,0 +1,89 @@ +#!/bin/bash -el +ulimit -c 0 + +KOKKOSKERNELS_BRANCH=$1 +TRILINOS_UPDATE_BRANCH=$2 +TRILINOS_PRISTINE_BRANCH=$3 +KOKKOS_UPDATE_BRANCH=$4 + +if [ -z $KOKKOSKERNELS_BRANCH ] +then + KOKKOSKERNELS_BRANCH=develop +fi + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 +module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.6 cmake/3.12.3 +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=ON +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=ON +export JENKINS_DO_COMPLEX=ON + +export JENKINS_ARCH="Power8,Kepler37" +export JENKINS_ARCH_CXX_FLAG="-mcpu=power8 -arch=sm_37 --expt-extended-lambda --std=c++11" +export JENKINS_ARCH_C_FLAG="-mcpu=power8" +export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp" +export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp;m" + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON + +export QUEUE=rhel7F + +module load python + +export KOKKOSKERNELS_PATH=${PWD}/kokkos-kernels + +#Already done: +if [ ! -d "${KOKKOSKERNELS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos-kernels ${KOKKOSKERNELS_PATH} +fi + + +cd ${KOKKOSKERNELS_PATH} +git checkout $KOKKOSKERNELS_BRANCH +git pull +cd .. 
+ +if [ -n $KOKKOS_UPDATE_BRANCH ] +then + export KOKKOS_PATH=${PWD}/kokkos + + if [ -z $KOKKOS_UPDATE_BRANCH ] + then + KOKKOS_UPDATE_BRANCH=develop + fi + + #Already done: + if [ ! -d "${KOKKOS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH} + fi + + cd ${KOKKOS_PATH} + git checkout $KOKKOS_UPDATE_BRANCH + git pull + cd .. +fi + +export CUDA_LAUNCH_BLOCKING=1 +export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 + +source ${KOKKOSKERNELS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH $KOKKOS_UPDATE_BRANCH + +export OMPI_CXX=${TRILINOS_UPDATED_PATH}/packages/kokkos/bin/nvcc_wrapper + +${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_lsf ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE} + From 065ac06293730cc2d6a56b03b08cb103962f7770 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 20 Dec 2018 13:16:57 -0700 Subject: [PATCH 34/47] test_all_sandia: update for proper complex scalar option Add extra help output --- scripts/test_all_sandia | 48 +++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 26eedfefdd..76d4fe52d0 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -407,10 +407,14 @@ RESULT_ROOT_PREFIX=TestAll if [ "$PRINT_HELP" = "True" ]; then echo "test_all_sandia :" + echo "" + echo "OPTIONS: list of flag options for non-default testing" + echo "" echo "--kokkoskernels-path=/Path/To/Kokkos: Path to the KokkosKernels root directory" echo " Defaults to root repo containing this script" echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" echo " Defaults to KokkosKernelsPath/../kokkos" + echo "" echo "--debug: Run tests in debug. 
Defaults to False" echo "--test-script: Test this script, not Kokkos" echo "--skip-hwloc: Do not do hwloc tests" @@ -418,9 +422,38 @@ if [ "$PRINT_HELP" = "True" ]; then echo "--spot-check: Minimal test set to issue pull request" echo "--dry-run: Just print what would be executed" echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" + echo "" echo "--arch=ARCHITECTURE: overwrite architecture flags" + echo "" + echo "--with-kokkos-options=OPT: set KOKKOS_OPTIONS" echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" + echo "" + echo "--opt-flag=FLAG: Optimization flag (default: -O3)" + echo "--cxxflags-extra=FLAG(S): append extra flags to cxxflags" + echo "--ldflags-extra=FLAG(S): append extra ldflags to ldflags" + echo "--cxxstandard=STD: set KOKKOS_CXX_STANDARD" + echo "" + echo "--with-scalars=SCALARS: set KOKKOSKERNELS_SCALARS" + echo " Provide a comma-separated list scalar types" + echo " Valid items:" + echo " float, complex, double, complex" + echo " Example: SCALARS='double,\"complex\\"'" + echo "" + echo "--with-ordinals=ORDS: set KOKKOSKERNELS_ORDINALS" + echo " Provide a comma-separated list ordinal types" + echo " Valid items:" + echo " int, int64_t" + echo "" + echo "--with-offsets=OFFS: set KOKKOSKERNELS_OFFSETS" + echo " Provide a comma-separated list offset types" + echo " Valid items:" + echo " int, size_t" + echo "" + echo "--with-layouts=LAYOUTS: set KOKKOSKERNELS_LAYOUTS" + echo " Provide a comma-separated list layouts" + echo " Valid items:" + echo " LayoutLeft,LayoutRight" + echo "" echo "--build-list=BUILD,BUILD,BUILD..." 
echo " Provide a comma-separated list of builds instead of running all builds" echo " Valid items:" @@ -444,14 +477,14 @@ if [ "$PRINT_HELP" = "True" ]; then echo " Run all gcc tests" echo " % test_all_sandia gcc" echo "" - echo " Run all gcc/4.7.2 and all intel tests" - echo " % test_all_sandia gcc/4.7.2 intel" + echo " Run all gcc/4.8.4 and all intel tests" + echo " % test_all_sandia gcc/4.8.4 intel" echo "" echo " Run all tests in debug" echo " % test_all_sandia --debug" echo "" - echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial" + echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" + echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" echo "" echo "If you want to kill the tests, do:" echo " hit ctrl-z" @@ -671,6 +704,9 @@ single_build_and_test() { local comment="no_comment" + # Keep variable wrapped in single quotes so complex passed correctly to generate_makefile + local kk_scalars=\'${KOKKOSKERNELS_SCALARS}\' + if [ "$TEST_SCRIPT" = "True" ]; then local rand=$[ 1 + $[ RANDOM % 10 ]] sleep $rand @@ -679,7 +715,7 @@ single_build_and_test() { run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } fi else - run_cmd ${KOKKOSKERNELS_PATH}/scripts/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} --with-tpls=${KOKKOSKERNELS_ENABLE_TPLS} &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/scripts/generate_makefile.bash --with-devices=$build $ARCH_FLAG 
--compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} --with-tpls=${KOKKOSKERNELS_ENABLE_TPLS} &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } local -i build_end_time=$(date +%s) From 5379c884fdc8f1ce2b2a2950d82dec53fc3273d8 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 23 Jan 2019 11:50:28 -0700 Subject: [PATCH 35/47] KokkosBlas - there is a bug in complex dot. So far we compute zdotu which does res += a(i)*b(i) It should perform zdotc res += conj(a(i))*b(i) --- src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index a5ac866517..83555c0177 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -51,9 +51,9 @@ extern "C" double ddot_ ( const int* N, const double* x, const int* x_inc, const double* y, const int* y_inc); extern "C" float sdot_ ( const int* N, const float* x, const int* x_inc, const float* y, const int* y_inc); -extern "C" void zdotu_( std::complex *res, const int* N, const std::complex* x, const int* x_inc, +extern "C" void zdotc_( std::complex *res, const int* N, const std::complex* x, const int* x_inc, const std::complex* y, const int* y_inc); -extern "C" void cdotu_( std::complex *res, const int* N, const std::complex* x, const int* x_inc, +extern "C" void cdotc_( std::complex *res, const int* N, const std::complex* x, const 
int* x_inc, const std::complex* y, const int* y_inc); namespace KokkosBlas { @@ -160,7 +160,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - zdotu_(reinterpret_cast* >(R.data()), \ + zdotc_(reinterpret_cast* >(R.data()), \ &N, \ reinterpret_cast* >(X.data()),&one, \ reinterpret_cast* >(Y.data()),&one); \ @@ -196,7 +196,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - cdotu_(reinterpret_cast* >(R.data()), \ + cdotc_(reinterpret_cast* >(R.data()), \ &N, \ reinterpret_cast* >(X.data()),&one, \ reinterpret_cast* >(Y.data()),&one); \ @@ -325,7 +325,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device (numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotu(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ + cublasZdotc(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ } else { \ Dot::dot(R,X,Y); \ } \ @@ -359,7 +359,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device (numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCdotu(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ + cublasCdotc(s.handle, N, reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, reinterpret_cast(&R())); \ } else { \ Dot::dot(R,X,Y); \ } \ From 5d6b0452b5bf9a5ed2e8b88317e649bb8832eba5 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 23 Jan 2019 12:08:50 -0700 Subject: [PATCH 36/47] KokkosBlas - dot product test is wrong fix the expected value to be computed with conjugate of the first input. 
--- unit_test/blas/Test_Blas1_dot.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 4875020942..87b7122328 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,6 +11,7 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; + typedef Kokkos::ArithTraits ats; typedef Kokkos::View::value?2*1e-5:1e-7; @@ -68,6 +70,7 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; + typedef Kokkos::ArithTraits ats; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; @@ -104,7 +107,7 @@ namespace Test { for(int j=0;j::value?2*1e-5:1e-7; From 5c4b7eba30186e8a2decb074acd6e42de31ce778 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 23 Jan 2019 16:16:01 -0700 Subject: [PATCH 37/47] KokkosBlas - hide blas fortran --- Makefile.kokkos-kernels | 42 + scripts/generate_makefile.bash | 2 +- .../tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 33 +- .../tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 32 +- .../tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 16 +- .../tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 14 +- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 14 +- .../tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 24 +- .../tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 59 +- .../tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 133 +-- src/impl/tpls/KokkosBlas_Host_tpl.cpp | 818 ++++++++++++++++++ src/impl/tpls/KokkosBlas_Host_tpl.hpp | 95 ++ 12 files changed, 1104 insertions(+), 178 deletions(-) create mode 100644 src/impl/tpls/KokkosBlas_Host_tpl.cpp create mode 100644 src/impl/tpls/KokkosBlas_Host_tpl.hpp diff --git a/Makefile.kokkos-kernels b/Makefile.kokkos-kernels index 796732d5f1..d959d9d747 100644 --- a/Makefile.kokkos-kernels +++ 
b/Makefile.kokkos-kernels @@ -71,6 +71,37 @@ tmp := $(shell echo "----------------------------------------------*/" >> Kokkos tmp := $(shell echo "\#ifndef KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp) tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.tmp) + +#==== User-settable options for Fortran mangling macros ================= +#With Makefile build, we rely on users' input + +# default mangling scheme with a single under score +KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l)) +ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1) + KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_ +endif +KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l)) +ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1) + KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__ +endif +KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l)) +ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1) + KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\# +endif + +tmp := $(shell echo "" >> KokkosKernels_config.tmp) +tmp := $(shell echo "/* ---------------------------------------------" >> KokkosKernels_config.tmp) +tmp := $(shell echo "Fortran BLAS mangling:" >> KokkosKernels_config.tmp) +tmp := $(shell echo " ---------------------------------------------*/" >> KokkosKernels_config.tmp) +tmp := $(shell echo "\#if !defined(F77_BLAS_MANGLE)" >> KokkosKernels_config.tmp ) +tmp := $(shell echo "\#define F77_BLAS_MANGLE(name,NAME) $(KOKKOSKERNELS_FORTRAN_GLOBAL)" >> KokkosKernels_config.tmp) +tmp := $(shell echo "\#endif" >> KokkosKernels_config.tmp ) + +KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX ?= 0 +ifeq 
($(KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX), 1) + tmp := $(shell echo "\#define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX" >> KokkosKernels_config.tmp ) +endif + #==== ETI Macros Scalars ================================================= KOKKOSKERNELS_INTERNAL_INST_SCALARS = KOKKOSKERNELS_INTERNAL_INST_DOUBLE=$(strip $(shell echo "$(KOKKOSKERNELS_SCALARS)" | grep "double" | wc -l)) @@ -401,6 +432,17 @@ ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_CUBLAS}, 1) KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Cuda_tpl.cpp endif +KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=0 +ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_BLAS}, 1) + KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1 +endif +ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_MKL}, 1) + KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS=1 +endif +ifeq (${KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS}, 1) + KOKKOSKERNELS_INTERNAL_SRC_BLAS += ${KOKKOSKERNELS_PATH}/src/impl/tpls/KokkosBlas_Host_tpl.cpp +endif + KOKKOSKERNELS_INTERNAL_HEADERS = $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/*.hpp) KOKKOSKERNELS_INTERNAL_HEADERS += $(wildcard ${KOKKOSKERNELS_PATH}/src/impl/generated_specializations_hpp/*/*eti_spec*.hpp) diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash index 2443cb0b66..3c92608ff3 100755 --- a/scripts/generate_makefile.bash +++ b/scripts/generate_makefile.bash @@ -4,7 +4,7 @@ KOKKOS_DEVICES="" KOKKOS_DO_EXAMPLES="1" -KOKKOSKERNELS_OPTIONS="eti-only" +KOKKOSKERNELS_OPTIONS="eti-only,blas-mangle_" KOKKOSKERNELS_ENABLE_TPLS="" while [[ $# > 0 ]] diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index fea7103ea9..f664a6b4fd 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -46,18 +46,7 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -extern "C" void daxpy_( const int* N, const double* alpha, - const double* x, const int* x_inc, - double* y, const int* y_inc); -extern "C" 
void saxpy_( const int* N, const float* alpha, - const float* x, const int* x_inc, - float* y, const int* y_inc); -extern "C" void zaxpy_( const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - std::complex* y, const int* y_inc); -extern "C" void caxpy_( const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - std::complex* y, const int* y_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -95,7 +84,7 @@ struct Axpby< \ axpby_print_specialization(); \ int N = X.extent(0); \ int one = 1; \ - daxpy_(&N,&alpha,X.data(),&one,Y.data(),&one); \ + HostBlas::axpy(N,alpha,X.data(),one,Y.data(),one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ Kokkos::Profiling::popRegion(); \ @@ -127,7 +116,7 @@ struct Axpby< \ axpby_print_specialization(); \ int N = X.extent(0); \ int one = 1; \ - saxpy_(&N,&alpha,X.data(),&one,Y.data(),&one); \ + HostBlas::axpy(N,alpha,X.data(),one,Y.data(),one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ Kokkos::Profiling::popRegion(); \ @@ -158,9 +147,11 @@ struct Axpby< \ axpby_print_specialization(); \ int N = X.extent(0); \ int one = 1; \ - zaxpy_(&N,reinterpret_cast* >(&alpha), \ - reinterpret_cast* >(X.data()),&one, \ - reinterpret_cast* >(Y.data()),&one); \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy \ + (N,alpha_val, \ + reinterpret_cast*>(X.data()),one, \ + reinterpret_cast* >(Y.data()),one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ Kokkos::Profiling::popRegion(); \ @@ -191,9 +182,11 @@ struct Axpby< \ axpby_print_specialization(); \ int N = X.extent(0); \ int one = 1; \ - caxpy_(&N,reinterpret_cast* >(&alpha), \ - reinterpret_cast* >(X.data()),&one, \ - reinterpret_cast* >(Y.data()),&one); \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy \ + (N,alpha_val, \ + reinterpret_cast*>(X.data()),one, \ + reinterpret_cast* >(Y.data()),one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ Kokkos::Profiling::popRegion(); \ diff --git 
a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 83555c0177..4d23c0094d 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -46,15 +46,7 @@ // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - -extern "C" double ddot_ ( const int* N, const double* x, const int* x_inc, - const double* y, const int* y_inc); -extern "C" float sdot_ ( const int* N, const float* x, const int* x_inc, - const float* y, const int* y_inc); -extern "C" void zdotc_( std::complex *res, const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); -extern "C" void cdotc_( std::complex *res, const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -94,7 +86,7 @@ Kokkos::View, \ dot_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = ddot_(&N,X.data(),&one,Y.data(),&one); \ + R() = HostBlas::dot(N,X.data(),one,Y.data(),one); \ } else { \ Dot::dot(R,X,Y); \ } \ @@ -127,7 +119,7 @@ Kokkos::View, \ dot_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = sdot_(&N,X.data(),&one,Y.data(),&one); \ + R() = HostBlas::dot(N,X.data(),one,Y.data(),one); \ } else { \ Dot::dot(R,X,Y); \ } \ @@ -160,10 +152,10 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - zdotc_(reinterpret_cast* >(R.data()), \ - &N, \ - reinterpret_cast* >(X.data()),&one, \ - reinterpret_cast* >(Y.data()),&one); \ + R() = HostBlas >::dot \ + (N, \ + reinterpret_cast* >(X.data()),one, \ + reinterpret_cast* >(Y.data()),one); \ } else { \ Dot::dot(R,X,Y); \ } \ @@ -196,11 +188,11 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - cdotc_(reinterpret_cast* >(R.data()), \ - &N, \ - reinterpret_cast* >(X.data()),&one, \ - 
reinterpret_cast* >(Y.data()),&one); \ - } else { \ + R() = HostBlas >::dot \ + (N, \ + reinterpret_cast* >(X.data()),one, \ + reinterpret_cast* >(Y.data()),one); \ + } else { \ Dot::dot(R,X,Y); \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index e185ca91b7..eb88f90355 100644 --- a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -46,11 +46,7 @@ // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - -extern "C" double dasum_ ( const int* N, const double* x, const int* x_inc); -extern "C" float sasum_ ( const int* N, const float* x, const int* x_inc); -extern "C" double dzasum_( const int* N, const std::complex* x, const int* x_inc); -extern "C" float dcasum_( const int* N, const std::complex* x, const int* x_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -88,7 +84,7 @@ Kokkos::View, \ nrm1_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = dasum_(&N,X.data(),&one); \ + R() = HostBlas::asum(N,X.data(),one); \ } else { \ Nrm1::nrm1(R,X); \ } \ @@ -119,7 +115,7 @@ Kokkos::View, \ nrm1_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = sasum_(&N,X.data(),&one); \ + R() = HostBlas::asum(N,X.data(),one); \ } else { \ Nrm1::nrm1(R,X); \ } \ @@ -149,8 +145,8 @@ Kokkos::View*, LAYOUT, Kokkos::Device (INT_MAX)) { \ nrm1_print_specialization(); \ int N = numElems; \ - int one = 1; \ - R() = dzasum_(&N,reinterpret_cast*>(X.data()),&one); \ + int one = 1; \ + R() = HostBlas >::asum(N,reinterpret_cast*>(X.data()),one); \ } else { \ Nrm1::nrm1(R,X); \ } \ @@ -181,7 +177,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - R() = dcasum_(&N,reinterpret_cast*>(X.data()),&one); \ + R() = HostBlas >::asum(N,reinterpret_cast*>(X.data()),one); \ } else { \ Nrm1::nrm1(R,X); \ } \ diff 
--git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 640adb0c49..d0040839da 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -46,11 +46,7 @@ // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - -extern "C" double dnrm2_ ( const int* N, const double* x, const int* x_inc); -extern "C" float snrm2_ ( const int* N, const float* x, const int* x_inc); -extern "C" double dznrm2_( const int* N, const std::complex* x, const int* x_inc); -extern "C" float scnrm2_( const int* N, const std::complex* x, const int* x_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -88,7 +84,7 @@ Kokkos::View, \ nrm2_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = dnrm2_(&N,X.data(),&one); \ + R() = HostBlas::nrm2(N,X.data(),one); \ if(!take_sqrt) R() = R()*R(); \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ @@ -120,7 +116,7 @@ Kokkos::View, \ nrm2_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = snrm2_(&N,X.data(),&one); \ + R() = HostBlas::nrm2(N,X.data(),one); \ if(!take_sqrt) R() = R()*R(); \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ @@ -152,7 +148,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - R() = dznrm2_(&N,reinterpret_cast*>(X.data()),&one); \ + R() = HostBlas >::nrm2(N,reinterpret_cast*>(X.data()),one); \ if(!take_sqrt) R() = R()*R(); \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ @@ -184,7 +180,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - R() = scnrm2_(&N,reinterpret_cast*>(X.data()),&one); \ + R() = HostBlas >::nrm2(N,reinterpret_cast*>(X.data()),one); \ if(!take_sqrt) R() = R()*R(); \ } else { \ Nrm2::nrm2(R,X,take_sqrt); \ diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index e0661fee21..485830ec1f 100644 
--- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -46,11 +46,7 @@ // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - -extern "C" int idamax_( const int* N, const double* x, const int* x_inc); -extern "C" int isamax_( const int* N, const float* x, const int* x_inc); -extern "C" int izamax_( const int* N, const std::complex* x, const int* x_inc); -extern "C" int icamax_( const int* N, const std::complex* x, const int* x_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -89,7 +85,7 @@ Kokkos::View, \ nrminf_print_specialization(); \ int N = numElems; \ int one = 1; \ - int idx = idamax_(&N,X.data(),&one)-1; \ + int idx = HostBlas::iamax(N,X.data(),one)-1; \ R() = X(idx); \ } else { \ NrmInf::nrminf(R,X); \ @@ -122,7 +118,7 @@ Kokkos::View, \ nrminf_print_specialization(); \ int N = numElems; \ int one = 1; \ - int idx = isamax_(&N,X.data(),&one)-1; \ + int idx = HostBlas::iamax(N,X.data(),one)-1; \ R() = X(idx); \ } else { \ NrmInf::nrminf(R,X); \ @@ -156,7 +152,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - int idx = izamax_(&N,reinterpret_cast*>(X.data()),&one)-1; \ + int idx = HostBlas >::iamax(N,reinterpret_cast*>(X.data()),one)-1; \ R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ @@ -190,7 +186,7 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - int idx = icamax_(&N,reinterpret_cast*>(X.data()),&one)-1; \ + int idx = HostBlas >::iamax(N,reinterpret_cast*>(X.data()),one)-1; \ R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index cb24ace50c..8061ef0dd3 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -46,15 +46,7 @@ // Generic Host side BLAS (could 
be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - -extern "C" void dscal_( const int* N, const double* alpha, - double* x, const int* x_inc); -extern "C" void sscal_( const int* N, const float* alpha, - float* x, const int* x_inc); -extern "C" void zscal_( const int* N, const std::complex* alpha, - std::complex* x, const int* x_inc); -extern "C" void cscal_( const int* N, const std::complex* alpha, - std::complex* x, const int* x_inc); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -94,7 +86,7 @@ Kokkos::View, \ scal_print_specialization(); \ int N = numElems; \ int one = 1; \ - dscal_(&N,&alpha,R.data(),&one); \ + HostBlas::scal(N,alpha,R.data(),one); \ } else { \ Scal::scal(R,alpha,X); \ } \ @@ -127,7 +119,7 @@ Kokkos::View, \ scal_print_specialization(); \ int N = numElems; \ int one = 1; \ - sscal_(&N,&alpha,R.data(),&one); \ + HostBlas::scal(N,alpha,R.data(),one); \ } else { \ Scal::scal(R,alpha,X); \ } \ @@ -160,7 +152,10 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - zscal_(&N,reinterpret_cast*>(&alpha),reinterpret_cast*>(R.data()),&one); \ + const std::complex alpha_val = alpha; \ + HostBlas >::scal\ + (N,alpha_val, \ + reinterpret_cast*>(R.data()), one); \ } else { \ Scal::scal(R,alpha,X); \ } \ @@ -193,7 +188,10 @@ Kokkos::View*, LAYOUT, Kokkos::Device(); \ int N = numElems; \ int one = 1; \ - cscal_(&N,reinterpret_cast*>(&alpha),reinterpret_cast*>(R.data()),&one); \ + const std::complex alpha_val = alpha; \ + HostBlas >::scal\ + (N,alpha_val, \ + reinterpret_cast*>(R.data()), one); \ } else { \ Scal::scal(R,alpha,X); \ } \ diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 1f015509a2..49a5f4ad34 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -45,34 +45,7 @@ #define KOKKOSBLAS2_GEMV_TPL_SPEC_DECL_HPP_ #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS 
-extern "C" void dgemv_( const char* trans, - const int* M, const int* N, - const double* alpha, - const double* A, const int* LDA, - const double* X, const int* INCX, - const double* beta, - double* Y, const int* INCY); -extern "C" void sgemv_( const char* trans, - const int* M, const int* N, - const float* alpha, - const float* A, const int* LDA, - const float* X, const int* INCX, - const float* beta, - float* Y, const int* INCY); -extern "C" void zgemv_( const char* trans, - const int* M, const int* N, - const std::complex* alpha, - const std::complex* A, const int* LDA, - const std::complex* X, const int* INCX, - const std::complex* beta, - std::complex* Y, const int* INCY); -extern "C" void cgemv_( const char* trans, - const int* M, const int* N, - const std::complex* alpha, - const std::complex* A, const int* LDA, - const std::complex* X, const int* INCX, - const std::complex* beta, - std::complex* Y, const int* INCY); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -109,7 +82,7 @@ struct GEMV< \ constexpr int one = 1; \ bool A_is_lr = std::is_same::value; \ const int AST = A_is_lr?A.stride(0):A.stride(1), LDA = AST == 0 ? 1 : AST; \ - dgemv_(trans,&M,&N,&alpha,A.data(),&LDA,X.data(),&one,&beta,Y.data(),&one); \ + HostBlas::gemv(trans[0],M,N,alpha,A.data(),LDA,X.data(),one,beta,Y.data(),one); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -146,7 +119,7 @@ struct GEMV< \ constexpr int one = 1; \ bool A_is_lr = std::is_same::value; \ const int AST = A_is_lr?A.stride(0):A.stride(1), LDA = AST == 0 ? 1 : AST; \ - sgemv_(trans,&M,&N,&alpha,A.data(),&LDA,X.data(),&one,&beta,Y.data(),&one); \ + HostBlas::gemv(trans[0],M,N,alpha,A.data(),LDA,X.data(),one,beta,Y.data(),one); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -183,11 +156,15 @@ struct GEMV< \ constexpr int one = 1; \ bool A_is_lr = std::is_same::value; \ const int AST = A_is_lr?A.stride(0):A.stride(1), LDA = AST == 0 ? 
1 : AST; \ - zgemv_(trans,&M,&N, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(X.data()),&one, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(Y.data()),&one); \ - Kokkos::Profiling::popRegion(); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv \ + (trans[0],M,N, \ + alpha_val, \ + reinterpret_cast*>(A.data()),LDA, \ + reinterpret_cast*>(X.data()),one, \ + beta_val, \ + reinterpret_cast< std::complex*>(Y.data()),one); \ + Kokkos::Profiling::popRegion(); \ } \ }; \ @@ -223,10 +200,14 @@ struct GEMV< \ constexpr int one = 1; \ bool A_is_lr = std::is_same::value; \ const int AST = A_is_lr?A.stride(0):A.stride(1), LDA = AST == 0 ? 1 : AST; \ - cgemv_(trans,&M,&N, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(X.data()),&one, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(Y.data()),&one); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv \ + (trans[0],M,N, \ + alpha_val, \ + reinterpret_cast*>(A.data()),LDA, \ + reinterpret_cast*>(X.data()),one, \ + beta_val, \ + reinterpret_cast< std::complex*>(Y.data()),one); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index b129c41ccb..d04a519535 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -45,34 +45,7 @@ #define KOKKOSBLAS3_GEMM_TPL_SPEC_DECL_HPP_ #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -extern "C" void dgemm_( const char* transA, const char* transB, - const int* M, const int* N, const int* K, - const double* alpha, - const double* A, const int* LDA, - const double* B, const int* LDB, - const double* beta, - double* C, const int* LDC); -extern "C" void sgemm_( const char* transA, const char* transB, - const int* M, const int* N, const int* K, - const float* alpha, - const float* A, const int* LDA, - const 
float* B, const int* LDB, - const float* beta, - float* C, const int* LDC); -extern "C" void zgemm_( const char* transA, const char* transB, - const int* M, const int* N, const int* K, - const std::complex* alpha, - const std::complex* A, const int* LDA, - const std::complex* B, const int* LDB, - const std::complex* beta, - std::complex* C, const int* LDC); -extern "C" void cgemm_( const char* transA, const char* transB, - const int* M, const int* N, const int* K, - const std::complex* alpha, - const std::complex* A, const int* LDA, - const std::complex* B, const int* LDB, - const std::complex* beta, - std::complex* C, const int* LDC); +#include "KokkosBlas_Host_tpl.hpp" namespace KokkosBlas { namespace Impl { @@ -118,11 +91,25 @@ struct GEMM< \ const int BST = B_is_lr?B.stride(0):B.stride(1), LDB = BST == 0 ? 1 : BST; \ const int CST = C_is_lr?C.stride(0):C.stride(1), LDC = CST == 0 ? 1 : CST; \ \ - if(!A_is_lr && !B_is_lr && !C_is_lr ) \ - dgemm_(transA,transB,&M,&N,&K,&alpha,A.data(),&LDA,B.data(),&LDB,&beta,C.data(),&LDC); \ - if(A_is_lr && B_is_lr && C_is_lr ) \ - dgemm_(transB,transA,&N,&M,&K,&alpha,B.data(),&LDB,A.data(),&LDA,&beta,C.data(),&LDC); \ - Kokkos::Profiling::popRegion(); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + HostBlas::gemm \ + (transA[0],transB[0], \ + M,N,K, \ + alpha, \ + A.data(),LDA, \ + B.data(),LDB, \ + beta, \ + C.data(),LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + HostBlas::gemm \ + (transB[0],transA[0], \ + N,M,K, \ + alpha, \ + B.data(),LDB, \ + A.data(),LDA, \ + beta, \ + C.data(),LDC); \ + Kokkos::Profiling::popRegion(); \ } \ }; @@ -167,10 +154,24 @@ struct GEMM< \ const int BST = B_is_lr?B.stride(0):B.stride(1), LDB = BST == 0 ? 1 : BST; \ const int CST = C_is_lr?C.stride(0):C.stride(1), LDC = CST == 0 ? 
1 : CST; \ \ - if(!A_is_lr && !B_is_lr && !C_is_lr ) \ - sgemm_(transA,transB,&M,&N,&K,&alpha,A.data(),&LDA,B.data(),&LDB,&beta,C.data(),&LDC); \ - if(A_is_lr && B_is_lr && C_is_lr ) \ - sgemm_(transB,transA,&N,&M,&K,&alpha,B.data(),&LDB,A.data(),&LDA,&beta,C.data(),&LDC); \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + HostBlas::gemm \ + (transA[0],transB[0], \ + M,N,K, \ + alpha, \ + A.data(),LDA, \ + B.data(),LDB, \ + beta, \ + C.data(),LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + HostBlas::gemm \ + (transB[0],transA[0], \ + N,M,K, \ + alpha, \ + B.data(),LDB, \ + A.data(),LDA, \ + beta, \ + C.data(),LDC); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -216,16 +217,25 @@ struct GEMM< \ const int BST = B_is_lr?B.stride(0):B.stride(1), LDB = BST == 0 ? 1 : BST; \ const int CST = C_is_lr?C.stride(0):C.stride(1), LDC = CST == 0 ? 1 : CST; \ \ - if(!A_is_lr && !B_is_lr && !C_is_lr ) \ - zgemm_(transA,transB,&M,&N,&K, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(B.data()),&LDB, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(C.data()),&LDC); \ - if(A_is_lr && B_is_lr && C_is_lr ) \ - zgemm_(transB,transA,&N,&M,&K, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(B.data()),&LDB, \ - reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(C.data()),&LDC); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + HostBlas >::gemm \ + (transA[0],transB[0], \ + M,N,K, \ + alpha_val, \ + reinterpret_cast*>(A.data()),LDA, \ + reinterpret_cast*>(B.data()),LDB, \ + beta_val, \ + reinterpret_cast< std::complex*>(C.data()),LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + HostBlas >::gemm \ + (transB[0],transA[0], \ + N,M,K, \ + alpha_val, \ + reinterpret_cast*>(B.data()),LDB, \ + reinterpret_cast*>(A.data()),LDA, \ + beta_val, \ + reinterpret_cast< std::complex*>(C.data()),LDC); \ Kokkos::Profiling::popRegion(); \ } \ }; \ @@ -271,16 +281,25 @@ struct GEMM< \ 
const int BST = B_is_lr?B.stride(0):B.stride(1), LDB = BST == 0 ? 1 : BST; \ const int CST = C_is_lr?C.stride(0):C.stride(1), LDC = CST == 0 ? 1 : CST; \ \ - if(!A_is_lr && !B_is_lr && !C_is_lr ) \ - cgemm_(transA,transB,&M,&N,&K, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(B.data()),&LDB, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(C.data()),&LDC); \ - if(A_is_lr && B_is_lr && C_is_lr ) \ - cgemm_(transB,transA,&N,&M,&K, \ - reinterpret_cast*>(&alpha),reinterpret_cast*>(B.data()),&LDB, \ - reinterpret_cast*>(A.data()),&LDA, \ - reinterpret_cast*>(&beta),reinterpret_cast*>(C.data()),&LDC); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + if(!A_is_lr && !B_is_lr && !C_is_lr ) \ + HostBlas >::gemm \ + (transA[0],transB[0], \ + M,N,K, \ + alpha_val, \ + reinterpret_cast*>(A.data()),LDA, \ + reinterpret_cast*>(B.data()),LDB, \ + beta_val, \ + reinterpret_cast< std::complex*>(C.data()),LDC); \ + if(A_is_lr && B_is_lr && C_is_lr ) \ + HostBlas >::gemm \ + (transB[0],transA[0], \ + N,M,K, \ + alpha_val, \ + reinterpret_cast*>(B.data()),LDB, \ + reinterpret_cast*>(A.data()),LDA, \ + beta_val, \ + reinterpret_cast< std::complex*>(C.data()),LDC); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas_Host_tpl.cpp b/src/impl/tpls/KokkosBlas_Host_tpl.cpp new file mode 100644 index 0000000000..19de2e5d02 --- /dev/null +++ b/src/impl/tpls/KokkosBlas_Host_tpl.cpp @@ -0,0 +1,818 @@ +/// \file KokkosBlas_Host_tpl.cpp +/// \brief BLAS wrapper for host tpls +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosKernels_config.h" +#include "KokkosBlas_Host_tpl.hpp" + +#if defined( KOKKOSKERNELS_ENABLE_TPL_BLAS ) + +/// Fortran headers +extern "C" { + + /// + /// scal + /// + void F77_BLAS_MANGLE(sscal,SSCAL)( const int* N, + const float* alpha, + /* */ float* x, const int* x_inc); + void F77_BLAS_MANGLE(dscal,DSCAL)( const int* N, + const double* alpha, + /* */ double* x, const int* 
x_inc); + void F77_BLAS_MANGLE(cscal,CSCAL)( const int* N, + const std::complex* alpha, + /* */ std::complex* x, const int* x_inc); + void F77_BLAS_MANGLE(zscal,ZSCAL)( const int* N, + const std::complex* alpha, + /* */ std::complex* x, const int* x_inc); + + /// + /// max + /// + int F77_BLAS_MANGLE(isamax,ISAMAX)( const int* N, const float* x, const int* x_inc); + int F77_BLAS_MANGLE(idamax,IDAMAX)( const int* N, const double* x, const int* x_inc); + int F77_BLAS_MANGLE(icamax,ICAMAX)( const int* N, const std::complex* x, const int* x_inc); + int F77_BLAS_MANGLE(izamax,IZAMAX)( const int* N, const std::complex* x, const int* x_inc); + + + /// + /// nrm2 + /// + float F77_BLAS_MANGLE(snrm2, SNRM2 )( const int* N, const float* x, const int* x_inc); + double F77_BLAS_MANGLE(dnrm2, DNRM2 )( const int* N, const double* x, const int* x_inc); + float F77_BLAS_MANGLE(scnrm2,SCNRM2)( const int* N, const std::complex* x, const int* x_inc); + double F77_BLAS_MANGLE(dznrm2,DZNRM2)( const int* N, const std::complex* x, const int* x_inc); + + /// + /// sum + /// + float F77_BLAS_MANGLE(sasum, SASUM )( const int* N, const float* x, const int* x_inc); + double F77_BLAS_MANGLE(dasum, DASUM )( const int* N, const double* x, const int* x_inc); + float F77_BLAS_MANGLE(scasum,SCASUM)( const int* N, const std::complex* x, const int* x_inc); + double F77_BLAS_MANGLE(dzasum,DZASUM)( const int* N, const std::complex* x, const int* x_inc); + + /// + /// dot + /// + float F77_BLAS_MANGLE(sdot,SDOT)( const int* N, const float* x, const int* x_inc, + const float* y, const int* y_inc); + double F77_BLAS_MANGLE(ddot,DDOT)( const int* N, const double* x, const int* x_inc, + const double* y, const int* y_inc); +# if defined( KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX ) + std::complex F77_BLAS_MANGLE(cdotu,CDOTU)( const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); + std::complex F77_BLAS_MANGLE(zdotu,ZDOTU)( const int* N, + const std::complex* x, const 
int* x_inc, + const std::complex* y, const int* y_inc); + std::complex F77_BLAS_MANGLE(cdotc,CDOTC)( const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); + std::complex F77_BLAS_MANGLE(zdotc,ZDOTC)( const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); +# else + void F77_BLAS_MANGLE(cdotu,CDOTU)( std::complex *res, + const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); + void F77_BLAS_MANGLE(zdotu,ZDOTU)( std::complex *res, + const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); + void F77_BLAS_MANGLE(cdotc,CDOTC)( std::complex *res, + const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); + void F77_BLAS_MANGLE(zdotc,ZDOTC)( std::complex *res, + const int* N, + const std::complex* x, const int* x_inc, + const std::complex* y, const int* y_inc); +# endif + + /// + /// axpy + /// + void F77_BLAS_MANGLE(saxpy,SAXPY)( const int* N, + const float* alpha, + const float* x, const int* x_inc, + /* */ float* y, const int* y_inc); + void F77_BLAS_MANGLE(daxpy,DAXPY)( const int* N, + const double* alpha, + const double* x, const int* x_inc, + /* */ double* y, const int* y_inc); + void F77_BLAS_MANGLE(caxpy,CAXPY)( const int* N, + const std::complex* alpha, + const std::complex* x, const int* x_inc, + /* */ std::complex* y, const int* y_inc); + void F77_BLAS_MANGLE(zaxpy,ZAXPY)( const int* N, + const std::complex* alpha, + const std::complex* x, const int* x_inc, + /* */ std::complex* y, const int* y_inc); + + /// + /// Gemv + /// + void F77_BLAS_MANGLE(sgemv,SGEMV)( const char*, + int*, int*, + const float*, + const float*, int*, + const float*, int*, + const float*, + /* */ float*, int* ); + void F77_BLAS_MANGLE(dgemv,DGEMV)( const char*, + int*, int*, + const double*, + const double*, int*, + const double*, int*, + const double*, + /* */ 
double*, int* ); + void F77_BLAS_MANGLE(cgemv,CGEMV)( const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + void F77_BLAS_MANGLE(zgemv,ZGEMV)( const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + + /// + /// Trsv + /// + + void F77_BLAS_MANGLE(strsv,STRSV)( const char*, const char*, const char*, + int*, + const float*, int*, + /* */ float*, int* ); + void F77_BLAS_MANGLE(dtrsv,DTRSV)( const char*, const char*, const char*, + int*, + const double*, int*, + /* */ double*, int* ); + void F77_BLAS_MANGLE(ctrsv,CTRSV)( const char*, const char*, const char*, + int*, + const std::complex*, int*, + /* */ std::complex*, int* ); + void F77_BLAS_MANGLE(ztrsv,ZTRSV)( const char*, const char*, const char*, + int*, + const std::complex*, int*, + /* */ std::complex*, int* ); + + /// + /// Gemm + /// + + void F77_BLAS_MANGLE(sgemm,SGEMM)( const char*, const char*, + int*, int*, int*, + const float*, + const float*, int*, + const float*, int*, + const float*, + /* */ float*, int* ); + void F77_BLAS_MANGLE(dgemm,DGEMM)( const char*, const char*, + int*, int*, int*, + const double*, + const double*, int*, + const double*, int*, + const double*, + /* */ double*, int* ); + void F77_BLAS_MANGLE(cgemm,CGEMM)( const char*, const char*, + int*, int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + void F77_BLAS_MANGLE(zgemm,ZGEMM)( const char*, const char*, + int*, int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + + /// + /// Herk + /// + + void F77_BLAS_MANGLE(ssyrk,SSYRK)( const char*, const char*, + int*, int*, + const float*, + const float*, int*, + const float*, + /* */ float*, 
int* ); + void F77_BLAS_MANGLE(dsyrk,DSYRK)( const char*, const char*, + int*, int*, + const double*, + const double*, int*, + const double*, + /* */ double*, int* ); + void F77_BLAS_MANGLE(cherk,CHERK)( const char*, const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + void F77_BLAS_MANGLE(zherk,ZHERK)( const char*, const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + const std::complex*, + /* */ std::complex*, int* ); + + /// + /// Trsm + /// + + void F77_BLAS_MANGLE(strsm,STRSM)( const char*, const char*, const char*, const char*, + int*, int*, + const float*, + const float*, int*, + /* */ float*, int* ); + void F77_BLAS_MANGLE(dtrsm,DTRSM)( const char*, const char*, const char*, const char*, + int*, int*, + const double*, + const double*, int*, + /* */ double*, int* ); + void F77_BLAS_MANGLE(ctrsm,CTRSM)( const char*, const char*, const char*, const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + /* */ std::complex*, int* ); + void F77_BLAS_MANGLE(ztrsm,ZTRSM)( const char*, const char*, const char*, const char*, + int*, int*, + const std::complex*, + const std::complex*, int*, + /* */ std::complex*, int* ); +} + + + + void F77_BLAS_MANGLE(sscal,SSCAL)( const int* N, + const float* alpha, + /* */ float* x, const int* x_inc); + void F77_BLAS_MANGLE(dscal,DSCAL)( const int* N, + const double* alpha, + /* */ double* x, const int* x_inc); + void F77_BLAS_MANGLE(cscal,CSCAL)( const int* N, + const std::complex* alpha, + /* */ std::complex* x, const int* x_inc); + void F77_BLAS_MANGLE(zscal,ZSCAL)( const int* N, + const std::complex* alpha, + /* */ std::complex* x, const int* x_inc); + +#define F77_FUNC_SSCAL F77_BLAS_MANGLE(sscal,SSCAL) +#define F77_FUNC_DSCAL F77_BLAS_MANGLE(dscal,DSCAL) +#define F77_FUNC_CSCAL F77_BLAS_MANGLE(cscal,CSCAL) +#define F77_FUNC_ZSCAL F77_BLAS_MANGLE(zscal,ZSCAL) + +#define F77_FUNC_ISAMAX 
F77_BLAS_MANGLE(isamax,ISAMAX) +#define F77_FUNC_IDAMAX F77_BLAS_MANGLE(idamax,IDAMAX) +#define F77_FUNC_ICAMAX F77_BLAS_MANGLE(icamax,ICAMAX) +#define F77_FUNC_IZAMAX F77_BLAS_MANGLE(izamax,IZAMAX) + +#define F77_FUNC_SNRM2 F77_BLAS_MANGLE(snrm2, SNRM2 ) +#define F77_FUNC_DNRM2 F77_BLAS_MANGLE(dnrm2, DNRM2 ) +#define F77_FUNC_SCNRM2 F77_BLAS_MANGLE(scnrm2, SCNRM2) +#define F77_FUNC_DZNRM2 F77_BLAS_MANGLE(dznrm2, DZNRM2) + +#define F77_FUNC_SASUM F77_BLAS_MANGLE(sasum, SASUM ) +#define F77_FUNC_DASUM F77_BLAS_MANGLE(dasum, DASUM ) +#define F77_FUNC_SCASUM F77_BLAS_MANGLE(scasum, SCASUM) +#define F77_FUNC_DZASUM F77_BLAS_MANGLE(dzasum, DZASUM) + +#define F77_FUNC_SDOT F77_BLAS_MANGLE(sdot,SDOT) +#define F77_FUNC_DDOT F77_BLAS_MANGLE(ddot,DDOT) +#define F77_FUNC_CDOTU F77_BLAS_MANGLE(cdotu,CDOTU) +#define F77_FUNC_ZDOTU F77_BLAS_MANGLE(zdotu,ZDOTU) +#define F77_FUNC_CDOTC F77_BLAS_MANGLE(cdotc,CDOTC) +#define F77_FUNC_ZDOTC F77_BLAS_MANGLE(zdotc,ZDOTC) + +#define F77_FUNC_SAXPY F77_BLAS_MANGLE(saxpy,SAXPY) +#define F77_FUNC_DAXPY F77_BLAS_MANGLE(daxpy,DAXPY) +#define F77_FUNC_CAXPY F77_BLAS_MANGLE(caxpy,CAXPY) +#define F77_FUNC_ZAXPY F77_BLAS_MANGLE(zaxpy,ZAXPY) + +#define F77_FUNC_SGEMV F77_BLAS_MANGLE(sgemv,SGEMV) +#define F77_FUNC_DGEMV F77_BLAS_MANGLE(dgemv,DGEMV) +#define F77_FUNC_CGEMV F77_BLAS_MANGLE(cgemv,CGEMV) +#define F77_FUNC_ZGEMV F77_BLAS_MANGLE(zgemv,ZGEMV) + +#define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv,STRSV) +#define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv,DTRSV) +#define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv,CTRSV) +#define F77_FUNC_ZTRSV F77_BLAS_MANGLE(ztrsv,ZTRSV) + +#define F77_FUNC_SGEMM F77_BLAS_MANGLE(sgemm,SGEMM) +#define F77_FUNC_DGEMM F77_BLAS_MANGLE(dgemm,DGEMM) +#define F77_FUNC_CGEMM F77_BLAS_MANGLE(cgemm,CGEMM) +#define F77_FUNC_ZGEMM F77_BLAS_MANGLE(zgemm,ZGEMM) + +#define F77_FUNC_SSYRK F77_BLAS_MANGLE(ssyrk,SSYRK) +#define F77_FUNC_DSYRK F77_BLAS_MANGLE(dsyrk,DSYRK) +#define F77_FUNC_CHERK F77_BLAS_MANGLE(cherk,CHERK) +#define 
F77_FUNC_ZHERK F77_BLAS_MANGLE(zherk,ZHERK) + +#define F77_FUNC_STRSM F77_BLAS_MANGLE(strsm,STRSM) +#define F77_FUNC_DTRSM F77_BLAS_MANGLE(dtrsm,DTRSM) +#define F77_FUNC_CTRSM F77_BLAS_MANGLE(ctrsm,CTRSM) +#define F77_FUNC_ZTRSM F77_BLAS_MANGLE(ztrsm,ZTRSM) + +namespace KokkosBlas { + namespace Impl { + + /// + /// float + /// + + template<> + void + HostBlas::scal(int n, + const float alpha, + /* */ float *x, int x_inc) { + F77_FUNC_SSCAL(&n, &alpha, x, &x_inc); + } + template<> + int + HostBlas::iamax(int n, + const float *x, int x_inc) { + return F77_FUNC_ISAMAX(&n, x, &x_inc); + } + template<> + float + HostBlas::nrm2(int n, + const float *x, int x_inc) { + return F77_FUNC_SNRM2(&n, x, &x_inc); + } + template<> + float + HostBlas::asum(int n, + const float *x, int x_inc) { + return F77_FUNC_SASUM(&n, x, &x_inc); + } + template<> + float + HostBlas::dot(int n, + const float *x, int x_inc, + const float *y, int y_inc) { + return F77_FUNC_SDOT(&n, x, &x_inc, y, &y_inc); + } + template<> + void + HostBlas::axpy(int n, + const float alpha, + const float *x, int x_inc, + /* */ float *y, int y_inc) { + F77_FUNC_SAXPY(&n, &alpha, x, &x_inc, y, &y_inc); + } + template<> + void + HostBlas::gemv(const char trans, + int m, int n, + const float alpha, + const float *a, int lda, + const float *b, int ldb, + const float beta, + /* */ float *c, int ldc) { + F77_FUNC_SGEMV(&trans, + &m, &n, + &alpha, + a, &lda, + b, &ldb, + &beta, + c, &ldc); + } + template<> + void + HostBlas::trsv(const char uplo, const char transa, const char diag, + int m, + const float *a, int lda, + /* */ float *b, int ldb) { + F77_FUNC_STRSV(&uplo, &transa, &diag, + &m, + a, &lda, + b, &ldb); + } + template<> + void + HostBlas::gemm(const char transa, const char transb, + int m, int n, int k, + const float alpha, + const float *a, int lda, + const float *b, int ldb, + const float beta, + /* */ float *c, int ldc) { + F77_FUNC_SGEMM(&transa, &transb, + &m, &n, &k, + &alpha, + a, &lda, + b, &ldb, + &beta, + 
c, &ldc); + } + template<> + void + HostBlas::herk(const char transa, const char transb, + int n, int k, + const float alpha, + const float *a, int lda, + const float beta, + /* */ float *c, int ldc) { + F77_FUNC_SSYRK(&transa, &transb, + &n, &k, + &alpha, + a, &lda, + &beta, + c, &ldc); + } + template<> + void + HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, + int m, int n, + const float alpha, + const float *a, int lda, + /* */ float *b, int ldb) { + F77_FUNC_STRSM(&side, &uplo, &transa, &diag, + &m, &n, + &alpha, + a, &lda, + b, &ldb); + } + + /// + /// double + /// + + template<> + void + HostBlas::scal(int n, + const double alpha, + /* */ double *x, int x_inc) { + F77_FUNC_DSCAL(&n, &alpha, x, &x_inc); + } + template<> + int + HostBlas::iamax(int n, + const double *x, int x_inc) { + return F77_FUNC_IDAMAX(&n, x, &x_inc); + } + template<> + double + HostBlas::nrm2(int n, + const double *x, int x_inc) { + return F77_FUNC_DNRM2(&n, x, &x_inc); + } + template<> + double + HostBlas::asum(int n, + const double *x, int x_inc) { + return F77_FUNC_DASUM(&n, x, &x_inc); + } + template<> + double + HostBlas::dot(int n, + const double *x, int x_inc, + const double *y, int y_inc) { + return F77_FUNC_DDOT(&n, x, &x_inc, y, &y_inc); + } + template<> + void + HostBlas::axpy(int n, + const double alpha, + const double *x, int x_inc, + /* */ double *y, int y_inc) { + F77_FUNC_DAXPY(&n, &alpha, x, &x_inc, y, &y_inc); + } + template<> + void + HostBlas::gemv(const char trans, + int m, int n, + const double alpha, + const double *a, int lda, + const double *b, int ldb, + const double beta, + /* */ double *c, int ldc) { + F77_FUNC_DGEMV(&trans, + &m, &n, + &alpha, + a, &lda, + b, &ldb, + &beta, + c, &ldc); + } + template<> + void + HostBlas::trsv(const char uplo, const char transa, const char diag, + int m, + const double *a, int lda, + /* */ double *b, int ldb) { + F77_FUNC_DTRSV(&uplo, &transa, &diag, + &m, + a, &lda, + b, &ldb); + } + 
template<> + void + HostBlas::gemm(const char transa, const char transb, + int m, int n, int k, + const double alpha, + const double *a, int lda, + const double *b, int ldb, + const double beta, + /* */ double *c, int ldc) { + F77_FUNC_DGEMM(&transa, &transb, + &m, &n, &k, + &alpha, + a, &lda, + b, &ldb, + &beta, + c, &ldc); + } + template<> + void + HostBlas::herk(const char transa, const char transb, + int n, int k, + const double alpha, + const double *a, int lda, + const double beta, + /* */ double *c, int ldc) { + F77_FUNC_DSYRK(&transa, &transb, + &n, &k, + &alpha, + a, &lda, + &beta, + c, &ldc); + } + template<> + void + HostBlas::trsm(const char side, const char uplo, const char transa, const char diag, + int m, int n, + const double alpha, + const double *a, int lda, + /* */ double *b, int ldb) { + F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, + &m, &n, + &alpha, + a, &lda, + b, &ldb); + } + + /// + /// std::complex + /// + + template<> + void + HostBlas >::scal(int n, + const std::complex alpha, + /* */ std::complex *x, int x_inc) { + F77_FUNC_CSCAL(&n, &alpha, x, &x_inc); + } + template<> + int + HostBlas >::iamax(int n, + const std::complex *x, int x_inc) { + F77_FUNC_ICAMAX(&n, x, &x_inc); + } + template<> + float + HostBlas >::nrm2(int n, + const std::complex *x, int x_inc) { + F77_FUNC_SCNRM2(&n, x, &x_inc); + } + template<> + float + HostBlas >::asum(int n, + const std::complex *x, int x_inc) { + F77_FUNC_SCASUM(&n, x, &x_inc); + } + template<> + std::complex + HostBlas >::dot(int n, + const std::complex *x, int x_inc, + const std::complex *y, int y_inc) { +# if defined( KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX ) + return F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); +# else + std::complex res; + F77_FUNC_CDOTC(&res, &n, x, &x_inc, y, &y_inc); + return res; +# endif + } + template<> + void + HostBlas >::axpy(int n, + const std::complex alpha, + const std::complex *x, int x_inc, + /* */ std::complex *y, int y_inc) { + F77_FUNC_CAXPY(&n, &alpha, x, &x_inc, y, 
&y_inc); + } + + template<> + void + HostBlas >::gemv(const char trans, + int m, int n, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex *b, int ldb, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_CGEMV(&trans, + &m, &n, + &alpha, + (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, + &beta, + ( std::complex*)c, &ldc); + } + template<> + void + HostBlas >::trsv(const char uplo, const char transa, const char diag, + int m, + const std::complex *a, int lda, + /* */ std::complex *b, int ldb) { + F77_FUNC_CTRSV(&uplo, &transa, &diag, + &m, + (const std::complex*)a, &lda, + ( std::complex*)b, &ldb); + } + template<> + void + HostBlas >::gemm(const char transa, const char transb, + int m, int n, int k, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex *b, int ldb, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_CGEMM(&transa, &transb, + &m, &n, &k, + &alpha, + (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, + &beta, + ( std::complex*)c, &ldc); + } + template<> + void + HostBlas >::herk(const char transa, const char transb, + int n, int k, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_CHERK(&transa, &transb, + &n, &k, + &alpha, + (const std::complex*)a, &lda, + &beta, + ( std::complex*)c, &ldc); + } + template<> + void + HostBlas >::trsm(const char side, const char uplo, const char transa, const char diag, + int m, int n, + const std::complex alpha, + const std::complex *a, int lda, + /* */ std::complex *b, int ldb) { + F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, + &m, &n, + &alpha, + (const std::complex*)a, &lda, + ( std::complex*)b, &ldb); + } + + /// + /// std::complex + /// + + + template<> + void + HostBlas >::scal(int n, + const std::complex alpha, + /* */ std::complex *x, int x_inc) { + F77_FUNC_ZSCAL(&n, &alpha, x, &x_inc); + } + 
template<> + int + HostBlas >::iamax(int n, + const std::complex *x, int x_inc) { + F77_FUNC_IZAMAX(&n, x, &x_inc); + } + template<> + double + HostBlas >::nrm2(int n, + const std::complex *x, int x_inc) { + F77_FUNC_DZNRM2(&n, x, &x_inc); + } + template<> + double + HostBlas >::asum(int n, + const std::complex *x, int x_inc) { + F77_FUNC_DZASUM(&n, x, &x_inc); + } + template<> + std::complex + HostBlas >::dot(int n, + const std::complex *x, int x_inc, + const std::complex *y, int y_inc) { +# if defined( KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX ) + return F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); +# else + std::complex res; + F77_FUNC_ZDOTC(&res, &n, x, &x_inc, y, &y_inc); + return res; +# endif + } + template<> + void + HostBlas >::axpy(int n, + const std::complex alpha, + const std::complex *x, int x_inc, + /* */ std::complex *y, int y_inc) { + F77_FUNC_ZAXPY(&n, &alpha, x, &x_inc, y, &y_inc); + } + + template<> + void + HostBlas >::gemv(const char trans, + int m, int n, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex *b, int ldb, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_ZGEMV(&trans, + &m, &n, + &alpha, + (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, + &beta, + ( std::complex*)c, &ldc); + } + template<> + void + HostBlas >::trsv(const char uplo, const char transa, const char diag, + int m, + const std::complex *a, int lda, + /* */ std::complex *b, int ldb) { + F77_FUNC_ZTRSV(&uplo, &transa, &diag, + &m, + (const std::complex*)a, &lda, + ( std::complex*)b, &ldb); + } + template<> + void + HostBlas >::gemm(const char transa, const char transb, + int m, int n, int k, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex *b, int ldb, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_ZGEMM(&transa, &transb, + &m, &n, &k, + &alpha, + (const std::complex*)a, &lda, + (const std::complex*)b, &ldb, + &beta, + ( std::complex*)c, &ldc); + } + 
template<> + void + HostBlas >::herk(const char transa, const char transb, + int n, int k, + const std::complex alpha, + const std::complex *a, int lda, + const std::complex beta, + /* */ std::complex *c, int ldc) { + F77_FUNC_ZHERK(&transa, &transb, + &n, &k, + &alpha, + (const std::complex*)a, &lda, + &beta, + ( std::complex*)c, &ldc); + } + template<> + void + HostBlas >::trsm(const char side, const char uplo, const char transa, const char diag, + int m, int n, + const std::complex alpha, + const std::complex *a, int lda, + /* */ std::complex *b, int ldb) { + F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, + &m, &n, + &alpha, + (const std::complex*)a, &lda, + ( std::complex*)b, &ldb); + } + + } +} +#endif diff --git a/src/impl/tpls/KokkosBlas_Host_tpl.hpp b/src/impl/tpls/KokkosBlas_Host_tpl.hpp new file mode 100644 index 0000000000..513197eebc --- /dev/null +++ b/src/impl/tpls/KokkosBlas_Host_tpl.hpp @@ -0,0 +1,95 @@ +#ifndef KOKKOSBLAS_HOST_TPL_HPP_ +#define KOKKOSBLAS_HOST_TPL_HPP_ + +/// \file KokkosBlas_Host_tpl.hpp +/// \brief BLAS wrapper +/// \author Kyungjoo Kim (kyukim@sandia.gov) + + +#include "KokkosKernels_config.h" +#include "Kokkos_ArithTraits.hpp" + +#if defined( KOKKOSKERNELS_ENABLE_TPL_BLAS ) + +namespace KokkosBlas { + namespace Impl { + + template + struct HostBlas { + typedef Kokkos::ArithTraits ats; + typedef typename ats::mag_type mag_type; + + static + void scal(int n, + const T alpha, + /* */ T *x, int x_inc); + + static + int iamax(int n, + const T *x, int x_inc); + + static + mag_type nrm2(int n, + const T *x, int x_inc); + + static + mag_type asum(int n, + const T *x, int x_inc); + + static + T dot(int n, + const T *x, int x_inc, + const T *y, int y_inc); + + static + void axpy(int n, + const T alpha, + const T *x, int x_inc, + /* */ T *y, int y_inc); + + static + void gemv(const char trans, + int m, int n, + const T alpha, + const T *a, int lda, + const T *b, int ldb, + const T beta, + /* */ T *c, int ldc); + + static + void trsv(const 
char uplo, const char transa, const char diag, + int m, + const T *a, int lda, + /* */ T *b, int ldb); + + static + void gemm(const char transa, const char transb, + int m, int n, int k, + const T alpha, + const T *a, int lda, + const T *b, int ldb, + const T beta, + /* */ T *c, int ldc); + + static + void herk(const char transa, const char transb, + int n, int k, + const T alpha, + const T *a, int lda, + const T beta, + /* */ T *c, int ldc); + + static + void trsm(const char side, const char uplo, const char transa, const char diag, + int m, int n, + const T alpha, + const T *a, int lda, + /* */ T *b, int ldb); + }; + + } +} + +#endif + +#endif From 0570193faf8f94db053c69bc562c873590108961 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 23 Jan 2019 16:33:36 -0700 Subject: [PATCH 38/47] KokkosBlas - add options for blas mangling --- Makefile.kokkos-kernels | 2 +- scripts/generate_makefile.bash | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Makefile.kokkos-kernels b/Makefile.kokkos-kernels index d959d9d747..a68eb2ebb0 100644 --- a/Makefile.kokkos-kernels +++ b/Makefile.kokkos-kernels @@ -97,7 +97,7 @@ tmp := $(shell echo "\#if !defined(F77_BLAS_MANGLE)" >> KokkosKernels_config.tmp tmp := $(shell echo "\#define F77_BLAS_MANGLE(name,NAME) $(KOKKOSKERNELS_FORTRAN_GLOBAL)" >> KokkosKernels_config.tmp) tmp := $(shell echo "\#endif" >> KokkosKernels_config.tmp ) -KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX ?= 0 +KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-return-complex" | wc -l)) ifeq ($(KOKKOSKERNELS_INTERNAL_TPL_BLAS_RETURN_COMPLEX), 1) tmp := $(shell echo "\#define KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX" >> KokkosKernels_config.tmp ) endif diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash index 3c92608ff3..20be08abfb 100755 --- a/scripts/generate_makefile.bash +++ b/scripts/generate_makefile.bash @@ -141,7 +141,12 @@ do echo 
"--with-layouts=[LAYOUTS]: Set layouts to be instantiated (LayoutLeft,LayoutRight)." echo "--prefix=/Install/Path: Path to install the Kokkos library." echo "--with-options=[OPT]: Set KokkosKernels Options:" - echo " eti_only: only allow ETI types to be enabled [default]" + echo " eti-only: only allow ETI types to be enabled [default]" + echo " manual overriding for fortran blas mangling:" + echo " blas-mangle, blas-mangle_[default], blas-mangle__" + echo " manual overriding for blas complex interface" + echo " blas-return-complex: e.g., ret = zdotc(&N, a, &inc_a, b, &inc_b)" + echo " otherwise, the interface would search zdotc(&ret, &N, a, &inc_a, b, &inc_b)" echo "--with-tpls=[OPT]: Set KokkosKernels TPLs:" echo " mkl,blas,cublas,cusparse" echo "" From 3477147c99c99975c5cfb6d52d2a769157f2864d Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 23 Jan 2019 17:01:08 -0700 Subject: [PATCH 39/47] KokkosBlas - fixing typos --- Makefile.kokkos-kernels | 18 ++++++++++-------- .../tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 4 ++-- src/impl/tpls/KokkosBlas_Host_tpl.cpp | 12 ++++++------ 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/Makefile.kokkos-kernels b/Makefile.kokkos-kernels index a68eb2ebb0..a22aa38ea5 100644 --- a/Makefile.kokkos-kernels +++ b/Makefile.kokkos-kernels @@ -76,17 +76,19 @@ tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.t #With Makefile build, we rely on users' input # default mangling scheme with a single under score -KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l)) -ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1) - KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_ -endif KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l)) ifeq 
($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1) KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__ -endif -KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l)) -ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1) - KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\# +else + KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle_" | wc -l)) + ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_SINGLE_UNDERSCORE), 1) + KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_ + else + KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle" | wc -l)) + ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_NO_UNDERSCORE), 1) + KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\# + endif + endif endif tmp := $(shell echo "" >> KokkosKernels_config.tmp) diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index f664a6b4fd..548ede0273 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -185,8 +185,8 @@ struct Axpby< \ const std::complex alpha_val = alpha; \ HostBlas >::axpy \ (N,alpha_val, \ - reinterpret_cast*>(X.data()),one, \ - reinterpret_cast* >(Y.data()),one); \ + reinterpret_cast*>(X.data()),one, \ + reinterpret_cast*>(Y.data()),one); \ } else \ Axpby::axpby(alpha,X,beta,Y); \ Kokkos::Profiling::popRegion(); \ diff --git a/src/impl/tpls/KokkosBlas_Host_tpl.cpp b/src/impl/tpls/KokkosBlas_Host_tpl.cpp index 19de2e5d02..a42fb62e25 100644 --- a/src/impl/tpls/KokkosBlas_Host_tpl.cpp +++ b/src/impl/tpls/KokkosBlas_Host_tpl.cpp @@ -577,19 +577,19 @@ namespace KokkosBlas { int HostBlas >::iamax(int n, const std::complex *x, int x_inc) { - 
F77_FUNC_ICAMAX(&n, x, &x_inc); + return F77_FUNC_ICAMAX(&n, x, &x_inc); } template<> float HostBlas >::nrm2(int n, const std::complex *x, int x_inc) { - F77_FUNC_SCNRM2(&n, x, &x_inc); + return F77_FUNC_SCNRM2(&n, x, &x_inc); } template<> float HostBlas >::asum(int n, const std::complex *x, int x_inc) { - F77_FUNC_SCASUM(&n, x, &x_inc); + return F77_FUNC_SCASUM(&n, x, &x_inc); } template<> std::complex @@ -703,19 +703,19 @@ namespace KokkosBlas { int HostBlas >::iamax(int n, const std::complex *x, int x_inc) { - F77_FUNC_IZAMAX(&n, x, &x_inc); + return F77_FUNC_IZAMAX(&n, x, &x_inc); } template<> double HostBlas >::nrm2(int n, const std::complex *x, int x_inc) { - F77_FUNC_DZNRM2(&n, x, &x_inc); + return F77_FUNC_DZNRM2(&n, x, &x_inc); } template<> double HostBlas >::asum(int n, const std::complex *x, int x_inc) { - F77_FUNC_DZASUM(&n, x, &x_inc); + return F77_FUNC_DZASUM(&n, x, &x_inc); } template<> std::complex From f067ae84442d6f34e6e737ac132ad1cd4463ef47 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Thu, 24 Jan 2019 12:13:15 -0700 Subject: [PATCH 40/47] KokkosBlas - cmake done --- CheckHostBlasReturnComplex.cmake | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 CheckHostBlasReturnComplex.cmake diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake new file mode 100644 index 0000000000..06418676d3 --- /dev/null +++ b/CheckHostBlasReturnComplex.cmake @@ -0,0 +1,38 @@ +INCLUDE(CheckCXXSourceRuns) + +FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) + + SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES}) + + SET(SOURCE + " +#include + +#define F77_BLAS_MANGLE${F77_BLAS_MANGLE} + +extern \"C\" { + std::complex F77_BLAS_MANGLE(zdotc,ZDOTC)( + const int* n, + const std::complex x[], const int* incx, + const std::complex y[], const int* incy); +} + +int main() { + const int NUM=2; + const int INC=1; + std::complex f[NUM]; + const std::complex + ONE = std::complex(0.0,1.0), + TWO = 
std::complex(0.0,2.0); + f[0] = ONE; + f[1] = TWO; + std::complex ret + = F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC); + return (ret.real() == double(5.0) ? 1 : 0); +} + " + ) + + CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME}) + +ENDFUNCTION() From 5f4ba6beaf3578728f5c08958af2309727754517 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Thu, 24 Jan 2019 12:24:36 -0700 Subject: [PATCH 41/47] KokkosBlas CMake fix --- CMakeLists.txt | 9 +++++++++ Makefile.kokkos-kernels | 1 + cmake/KokkosKernels_config.h.in | 9 +++++++++ scripts/test_all_sandia | 2 ++ src/CMakeLists.txt | 6 ++++++ 5 files changed, 27 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb8b7757ca..a674cf69d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -323,6 +323,15 @@ IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) LIST(APPEND TPL_LIST "CUBLAS") ENDIF() +# ================================================================== +# Fortran Complex BLAS +# ================================================================== + +IF (KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS) + INCLUDE(CheckHostBlasReturnComplex.cmake) + CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) +ENDIF() + # ================================================================== # CMake Summary # ================================================================== diff --git a/Makefile.kokkos-kernels b/Makefile.kokkos-kernels index a22aa38ea5..49b26befc0 100644 --- a/Makefile.kokkos-kernels +++ b/Makefile.kokkos-kernels @@ -76,6 +76,7 @@ tmp := $(shell echo "\#define KOKKOSKERNELS_CONFIG_H_" >> KokkosKernels_config.t #With Makefile build, we rely on users' input # default mangling scheme with a single under score +KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#_ KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES := $(strip $(shell echo $(KOKKOSKERNELS_OPTIONS) | grep "blas-mangle__" | wc -l)) ifeq ($(KOKKOSKERNELS_INTERNAL_OVERRIDE_FORTRAN_MANGLING_WITH_DOUBLE_UNDERSCORES), 1) 
KOKKOSKERNELS_FORTRAN_GLOBAL = name\#\#__ diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 67a9c8f501..f83eec704f 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -1,6 +1,15 @@ #ifndef KOKKOSKERNELS_CONFIG_H #define KOKKOSKERNELS_CONFIG_H + +/* Define Fortran mangle from Trilinos macro definition */ +#ifndef F77_BLAS_MANGLE +# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@ +#endif + +/* Define if fortran blas 1 function can return complex type */ +#cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX + /* Define if building in debug mode */ #cmakedefine HAVE_KOKKOSKERNELS_DEBUG diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 76d4fe52d0..8aa081958a 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -248,6 +248,8 @@ elif [ "$MACHINE" = "white" ]; then module load netlib/3.8.0/gcc export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a" + # blas is compiled by gcc so use single underscore and power8 use return complex + KOKKOS_OPTIONS="eti-only,blas-mangle_,blas-return-complex" # Don't do pthread on white. 
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ce5f3c9824..3b4914ae84 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,6 +23,12 @@ APPEND_GLOB(HEADERS ${DIR}/*.hpp) SET_AND_INC_DIRS(DIR ${CMAKE_CURRENT_SOURCE_DIR}/blas/impl) APPEND_GLOB(HEADERS ${DIR}/*.hpp) +#Blas host wrapper +IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) + APPEND_GLOB(HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/tpls/KokkosBlas_Host_tpl.hpp) + APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/tpls/KokkosBlas_Host_tpl.cpp) +ENDIF() + #Include Sparse SET_AND_INC_DIRS(DIR ${CMAKE_CURRENT_SOURCE_DIR}/sparse) From 8d8e9261e26b3ab5399cf449c24aab781de545dc Mon Sep 17 00:00:00 2001 From: William McLendon Date: Thu, 24 Jan 2019 15:48:44 -0700 Subject: [PATCH 42/47] Fix shadowed declarations I was getting a lot of compiler errors when I had -Wshadow enabled so I fixed them to make it compile. --- .../impl/KokkosSparse_spgemm_imp_outer.hpp | 4 +- .../KokkosSparse_spgemm_impl_compression.hpp | 28 ++++++------- .../impl/KokkosSparse_spgemm_impl_def.hpp | 10 ++--- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 24 +++++------ .../impl/KokkosSparse_spgemm_impl_speed.hpp | 6 +-- .../KokkosSparse_spgemm_impl_symbolic.hpp | 42 ++++++++++--------- 6 files changed, 58 insertions(+), 56 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp index 54ab8d5cea..16d3d39dae 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp @@ -881,7 +881,7 @@ KokkosSPGEMM_numeric_outer( c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, c_scalar_nnz_view_t &valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space){ + KokkosKernels::Impl::ExecSpaceType my_exec_space_){ throw std::runtime_error ("Cannot run outer product. 
ENABLE openmp and outer product to run\n"); } #endif @@ -898,7 +898,7 @@ void KokkosSPGEMM c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, c_scalar_nnz_view_t &valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space){ + KokkosKernels::Impl::ExecSpaceType my_exec_space_){ throw std::runtime_error ("Cannot run outer product. ENABLE openmp and outer product to run\n"); } #endif diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 784b5bb73b..4fcfa0191e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -603,9 +603,9 @@ struct KokkosSPGEMM }); //here we find the column_set indices for each column. - //if it is integer, we divide the column index by 32 with shifts. + //if it is integer, we divide the column index by 32 with shifts. //result_keys is an array of size vector_size - //this is used as hashtable. + //this is used as hashtable. Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, work_to_handle), [&] (nnz_lno_t i) { @@ -624,14 +624,14 @@ struct KokkosSPGEMM else if (result_keys[new_hash] == r){ if (Kokkos::atomic_compare_exchange_strong(result_keys + new_hash, r, n_set_index)){ //MD 4/4/18: one these architectures there can be divergence in the warp. - //once the keys are set, some other vector lane might be doing a + //once the keys are set, some other vector lane might be doing a //fetch_or before we set with n_set. Therefore it is necessary to do //atomic, and set it with zero as above. 
#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_VOLTA70) || defined(KOKKOS_ARCH_VOLTA72) Kokkos::atomic_fetch_or(result_vals + new_hash, n_set); #else result_vals[new_hash] = n_set; -#endif +#endif break; } } @@ -655,7 +655,7 @@ struct KokkosSPGEMM nnz_lno_t hash = n_set_index & shared_memory_hash_func;//% shmem_hash_size; if (n_set_index == -1) hash = -1; num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( - teamMember, vector_size, hash,n_set_index, + teamMember, vector_size, hash,n_set_index, n_set, used_hash_sizes, shmem_hash_size); }, overall_num_unsuccess); @@ -689,8 +689,8 @@ struct KokkosSPGEMM if (num_unsuccess) hash = n_set_index & (pow2_hash_func); //this parallel_for is not really needed. - //we just need a sync threads at the end of the insertion. - //Basically, we do not want + //we just need a sync threads at the end of the insertion. + //Basically, we do not want //new_row_map(row_ind) = rowBeginP + used_hash_sizes[0] + used_hash_sizes[1]; //to execute before the below insertion finishes. //parallel_for will provide this mechanism. @@ -764,7 +764,7 @@ bool KokkosSPGEMM out_nnz_view_t &out_nnz_sets, bool compress_in_single_step){ //get the execution space type. - KokkosKernels::Impl::ExecSpaceType my_exec_space = this->handle->get_handle_exec_space(); + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. 
@@ -795,7 +795,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -809,7 +809,7 @@ bool KokkosSPGEMM //if compressing in single step, allocate the memory as upperbound. //TODO: two step is not there for cuda. - if (compress_in_single_step || my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -831,14 +831,14 @@ bool KokkosSPGEMM shmem_size, //shared memory size. team_row_chunk_size //chunksize. 
,suggested_team_size, KOKKOSKERNELS_VERBOSE, - my_exec_space + lcl_my_exec_space ); double min_reduction = this->handle->get_spgemm_handle()->get_compression_cut_off(); size_t OriginaltotalFlops = this->handle->get_spgemm_handle()->original_overall_flops; timer1.reset(); //bool compression_applied = false; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -861,7 +861,7 @@ bool KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { size_t free_byte ; size_t total_byte ; @@ -1030,7 +1030,7 @@ bool KokkosSPGEMM } return true; -} +} // compressMatrix (end) } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 2ceafde1b0..b8cac5e17f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -62,7 +62,7 @@ void KokkosSPGEMM //get the algorithm and execution space. 
//SPGEMMAlgorithm spgemm_algorithm = this->handle->get_spgemm_handle()->get_algorithm_type(); - KokkosKernels::Impl::ExecSpaceType my_exec_space = KokkosKernels::Impl::get_exec_space_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "Numeric PHASE" << std::endl; @@ -70,10 +70,10 @@ void KokkosSPGEMM if (spgemm_algorithm == SPGEMM_KK_SPEED || spgemm_algorithm == SPGEMM_KK_DENSE) { - this->KokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, my_exec_space); + this->KokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, my_exec_space_); } else { - this->KokkosSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, my_exec_space); + this->KokkosSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, my_exec_space_); } } @@ -120,11 +120,11 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space = KokkosKernels::Impl::get_exec_space_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); //compress in single step if it is cuda execution space. 
- if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { compress_in_single_step = true; } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index ddfac39491..9dc00a4910 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1122,7 +1122,7 @@ void c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space){ + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space){ if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tHASH MODE" << std::endl; @@ -1183,7 +1183,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ //then chose the best method and parameters. size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1299,7 +1299,7 @@ void rowmapC_, entriesC_, valuesC_, - my_exec_space); + lcl_my_exec_space); return; } } @@ -1313,7 +1313,7 @@ void //required memory for L2 - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1356,7 +1356,7 @@ void int num_chunks = concurrency / suggested_vector_size; #if defined( KOKKOS_ENABLE_CUDA ) - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { size_t free_byte ; size_t total_byte ; @@ -1390,7 +1390,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ my_pool_type = 
KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1428,7 +1428,7 @@ void min_hash_size, max_nnz, suggested_team_size, - my_exec_space, + lcl_my_exec_space, team_row_chunk_size, first_level_cut_off, flops_per_row, KOKKOSKERNELS_VERBOSE); @@ -1439,7 +1439,7 @@ void } timer1.reset(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_SPREADTEAM", gpu_team_policy4_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace::fence(); @@ -1498,7 +1498,7 @@ void c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space){ + KokkosKernels::Impl::ExecSpaceType my_exec_space_){ if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tHASH MODE" << std::endl; } @@ -1534,7 +1534,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1573,7 +1573,7 @@ void min_hash_size, max_nnz, suggested_team_size, - my_exec_space, + my_exec_space_, team_row_chunk_size, first_level_cut_off, this->handle->get_spgemm_handle()->row_flops, KOKKOSKERNELS_VERBOSE); @@ -1584,7 +1584,7 @@ void } timer1.reset(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace::fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 672b6510ee..4c5270a29a 100644 --- 
a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -480,7 +480,7 @@ void c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space){ + KokkosKernels::Impl::ExecSpaceType my_exec_space_){ if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPEED MODE" << std::endl; @@ -496,7 +496,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); @@ -599,7 +599,7 @@ void entriesC_, valuesC_, m_space, - my_exec_space, + my_exec_space_, team_row_chunk_size); MyExecSpace::fence(); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 7deb57ca9a..cebc15b281 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1516,8 +1516,8 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; - KokkosKernels::Impl::ExecSpaceType my_exec_space = this->handle->get_handle_exec_space(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1534,7 +1534,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1545,7 +1545,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1658,13 +1658,13 @@ void KokkosSPGEMM //initizalize value for the mem pool nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } #if defined( KOKKOS_ENABLE_CUDA ) - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { size_t free_byte ; size_t total_byte ; cudaMemGetInfo( &free_byte, &total_byte ) ; @@ -1714,7 +1714,8 @@ void KokkosSPGEMM team_row_chunk_size, suggested_vector_size, m_space, - my_exec_space,KOKKOSKERNELS_VERBOSE + lcl_my_exec_space, + KOKKOSKERNELS_VERBOSE ); if (KOKKOSKERNELS_VERBOSE){ @@ -1727,7 +1728,7 @@ void KokkosSPGEMM timer1.reset(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1781,7 +1782,7 @@ void KokkosSPGEMM Kokkos::deep_copy (h_c_nnz_size, d_c_nnz_size); typename c_row_view_t::non_const_value_type c_nnz_size 
= h_c_nnz_size(); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); -} +} // end: symbolic_c_no_compression template spgemm_algorithm; - KokkosKernels::Impl::ExecSpaceType my_exec_space = this->handle->get_handle_exec_space(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1819,7 +1820,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1829,7 +1830,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1840,7 +1841,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ //then chose the best method and parameters. 
current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1970,7 +1971,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1985,13 +1986,13 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } #if defined( KOKKOS_ENABLE_CUDA ) - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { size_t free_byte ; size_t total_byte ; cudaMemGetInfo( &free_byte, &total_byte ) ; @@ -2041,7 +2042,8 @@ void KokkosSPGEMM team_row_chunk_size, suggested_vector_size, m_space, - my_exec_space,KOKKOSKERNELS_VERBOSE + lcl_my_exec_space, + KOKKOSKERNELS_VERBOSE ); if (KOKKOSKERNELS_VERBOSE){ @@ -2053,7 +2055,7 @@ void KokkosSPGEMM timer1.reset(); - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2283,7 +2285,7 @@ void KokkosSPGEMM Kokkos::deep_copy (h_c_nnz_size, d_c_nnz_size); typename c_row_view_t::non_const_value_type c_nnz_size = h_c_nnz_size(); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); -} +} // symbolic_c (end) template Date: Thu, 24 Jan 2019 18:19:16 -0700 Subject: [PATCH 43/47] test_all_sandia gcc 7.x 
version GCC 7.3.0 isn't available on Sandia RHEL7 machines. Additionally, we're stuck with supporting gcc 7.2.0 for the foreseeable future, because Trilinos must support it: it is the gcc 7.x compiler available on at least one machine needed by customers. We should therefore probably be testing kokkos-kernels with gcc 7.2.0. --- scripts/test_all_sandia | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test_all_sandia b/scripts/test_all_sandia index 8aa081958a..d723e68a52 100755 --- a/scripts/test_all_sandia +++ b/scripts/test_all_sandia @@ -212,7 +212,7 @@ if [ "$MACHINE" = "sems" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/4.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" @@ -223,7 +223,7 @@ if [ "$MACHINE" = "sems" ]; then "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" From 35ff4063dc357151af94bba60fdfad7b3a443b2e Mon Sep 17 00:00:00 2001 From: William McLendon Date: Tue, 29 Jan 2019 10:24:43 -0700 Subject: [PATCH 44/47] Add header guards to
HashmapAccumulator --- src/common/KokkosKernels_HashmapAccumulator.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index f1bd94cd94..09ea1e9151 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -40,6 +40,8 @@ // ************************************************************************ //@HEADER */ +#ifndef _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP +#define _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP #include #include @@ -739,4 +741,8 @@ struct HashmapAccumulator } // namespace Experimental -} // namespace KokkosKernels \ No newline at end of file +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP + + From 43d3f4a9793477237e6288ff2e27cf175d9ed732 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 29 Jan 2019 15:41:41 -0700 Subject: [PATCH 45/47] Fix return code for BLAS complex api check Follow-on to PR #374 --- CMakeLists.txt | 2 +- CheckHostBlasReturnComplex.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a674cf69d3..ef6b2a782b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -327,7 +327,7 @@ ENDIF() # Fortran Complex BLAS # ================================================================== -IF (KOKKOSKERNELS_INTERNAL_ENABLE_HOST_BLAS) +IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 06418676d3..f7a74ce2be 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -28,7 +28,7 @@ int main() { f[1] = TWO; std::complex ret = F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC); - return (ret.real() == double(5.0) ? 
1 : 0); + return (ret.real() == double(5.0) ? 0 : 1); } " ) From 1767da405d8c26403dcfc067d51356c9f4f4d2e5 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 29 Jan 2019 17:46:44 -0700 Subject: [PATCH 46/47] Fix GEMM scratch memory size Addresses issue #368 reported by @etphipp, matches patch in trilinos/Trilinos#4168 scratch_memory_size should call shmem_size rather than required_allocation_size. --- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index d293df04cf..e68d03116c 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -510,9 +510,9 @@ struct GEMMImpl { void run(int team_size, int vector_length, int scr_level) { scratch_level = scr_level; int scratch_memory_size = - ViewTypeAScratch::required_allocation_size() + - ViewTypeBScratch::required_allocation_size() + - ViewTypeCScratch::required_allocation_size(); + ViewTypeAScratch::shmem_size() + + ViewTypeBScratch::shmem_size() + + ViewTypeCScratch::shmem_size(); Kokkos::TeamPolicy> policy(num_blocks_0*num_blocks_1,team_size,vector_length); From 6a7903217bf3257c8646419cbdf116a139c01c86 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 5 Feb 2019 17:09:02 -0700 Subject: [PATCH 47/47] Adding Changelog for Release 2.8.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 2.8 --- CHANGELOG.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed12fd9c6f..d6255e623b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Change Log +## [2.8.00](https://github.com/kokkos/kokkos-kernels/tree/2.8.00) (2019-02-05) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.24...2.8.00) + +**Implemented enhancements:** + +- Capability, Tests: C++14 Support and Testing 
[\#351](https://github.com/kokkos/kokkos-kernels/issues/351) +- Capability: Batched getrs [\#332](https://github.com/kokkos/kokkos-kernels/issues/332) +- More Kernel Labels for KokkosBlas [\#239](https://github.com/kokkos/kokkos-kernels/issues/239) +- Name all parallel kernels and regions [\#124](https://github.com/kokkos/kokkos-kernels/issues/124) + +**Fixed bugs:** + +- BLAS TPL: BLAS underscore mangling [\#369](https://github.com/kokkos/kokkos-kernels/issues/369) +- BLAS TPL, Complex: Promotion 2.7.24 broke MV unit tests in Tpetra with complex types [\#360](https://github.com/kokkos/kokkos-kernels/issues/360) +- GEMM: GEMM uses wrong function for computing shared memory allocation size [\#368](https://github.com/kokkos/kokkos-kernels/issues/368) +- BuildSystem: BLAS TPL macro not properly enabled with MKL BLAS [\#347](https://github.com/kokkos/kokkos-kernels/issues/347) +- BuildSystem: make clean - errors [\#353](https://github.com/kokkos/kokkos-kernels/issues/353) +- Compiler Workaround: Internal compiler error in KokkosBatched::Experimental::TeamGemm [\#349](https://github.com/kokkos/kokkos-kernels/issues/349) +- KokkosBlas: Some KokkosBlas kernels assume default execution space [\#14](https://github.com/kokkos/kokkos-kernels/issues/14) + ## [2.7.24](https://github.com/kokkos/kokkos-kernels/tree/2.7.24) (2018-11-04) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/2.7.00...2.7.24)