diff --git a/src/blas/KokkosBlas1_nrm2w.hpp b/src/blas/KokkosBlas1_nrm2w.hpp index 981897d9ae..43d32e7812 100644 --- a/src/blas/KokkosBlas1_nrm2w.hpp +++ b/src/blas/KokkosBlas1_nrm2w.hpp @@ -76,7 +76,8 @@ nrm2w(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -134,20 +135,21 @@ void nrm2w(const RV& R, const XMV& X, const XMV& W, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/src/blas/KokkosBlas1_nrm2w_squared.hpp b/src/blas/KokkosBlas1_nrm2w_squared.hpp index 2ab07af0c5..6aec955de2 100644 --- a/src/blas/KokkosBlas1_nrm2w_squared.hpp +++ b/src/blas/KokkosBlas1_nrm2w_squared.hpp @@ -77,7 +77,8 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -135,20 +136,21 @@ void nrm2w_squared( KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 500dc035ca..dfbae10a99 100644 --- a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -131,7 +131,8 @@ void MV_Dot_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerDot; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -156,7 +157,7 @@ void MV_Dot_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); MV_Dot_Invoke(tempResult, x, y); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_dot_spec.hpp b/src/blas/impl/KokkosBlas1_dot_spec.hpp index 350934230d..33c7603057 100644 --- a/src/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/src/blas/impl/KokkosBlas1_dot_spec.hpp @@ -377,6 +377,20 @@ struct Dot + static auto getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return Kokkos::subview(v, Kokkos::ALL(), 0); + } + + template + static V getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return v; + } + static void dot(const RV& R, const XV& X, const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" @@ -392,14 +406,31 @@ struct Dot(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - MV_Dot_Invoke(R, X, Y); + const size_type numDots = std::max(X.extent(1), Y.extent(1)); + if (numDots == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = getFirstColumn(X); + auto Y0 = getFirstColumn(Y); + if (numRows < static_cast(INT_MAX)) { + typedef int index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } else { + typedef int64_t index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } } else { - typedef std::int64_t index_type; - MV_Dot_Invoke(R, X, Y); + if (numRows < static_cast(INT_MAX) && + numRows * numDots < static_cast(INT_MAX)) { + typedef int index_type; + MV_Dot_Invoke(R, X, Y); + } else { + typedef std::int64_t index_type; + MV_Dot_Invoke(R, X, Y); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 07422035b7..2002ef2c39 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -170,7 +170,8 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -195,7 +196,7 @@ void MV_Nrm1_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); MV_Nrm1_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp index df86d00fa2..478395d7a9 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -200,12 +200,23 @@ struct Nrm1 { : "KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm1_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm1_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Nrm1_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Nrm1_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm1_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Nrm1_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index 4efc0e6c6d..f2b0e826bc 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -200,7 +200,8 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); MV_Nrm2_Invoke(tempResult, x, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp index 340d78fdf1..71afb2ede3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -200,12 +200,24 @@ struct Nrm2 { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2_Invoke(R0, X0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2_Invoke(R0, X0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2_Invoke(R, X, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2_Invoke(R, X, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 3013fd17f8..3f202ca430 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -199,7 +199,8 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2w_Invoke( r.extent(0)); MV_Nrm2w_Invoke(tempResult, x, w, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp index fe437bbc5c..28162bce5f 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -201,12 +201,25 @@ struct Nrm2w { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numCols == 1) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto W0 = Kokkos::subview(W, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2w_Invoke(R0, X0, W0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2w_Invoke(R0, X0, W0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_sum_impl.hpp b/src/blas/impl/KokkosBlas1_sum_impl.hpp index 05cede0f0d..b87f2e1092 100644 --- a/src/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/src/blas/impl/KokkosBlas1_sum_impl.hpp @@ -162,7 +162,8 @@ void MV_Sum_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -187,7 +188,7 @@ void MV_Sum_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); MV_Sum_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_sum_spec.hpp b/src/blas/impl/KokkosBlas1_sum_spec.hpp index 505296cab9..09c34299c7 100644 --- a/src/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/src/blas/impl/KokkosBlas1_sum_spec.hpp @@ -197,12 +197,23 @@ struct Sum { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Sum_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Sum_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Sum_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Sum_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Sum_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Sum_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 642a0bf5f0..16d54e3dce 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -15,6 +15,8 @@ #include "Test_Blas1_nrm1.hpp" #include "Test_Blas1_nrm2_squared.hpp" #include "Test_Blas1_nrm2.hpp" +#include "Test_Blas1_nrm2w_squared.hpp" +#include "Test_Blas1_nrm2w.hpp" #include "Test_Blas1_nrminf.hpp" #include "Test_Blas1_reciprocal.hpp" #include "Test_Blas1_scal.hpp" diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 920ac06c77..b2e3f95628 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -196,6 +196,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -207,6 +208,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -218,6 +220,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index 72861bf5a3..c68492b6dd 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -149,6 +149,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -159,6 +160,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -169,6 +171,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 94d5414e15..688035f842 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -144,6 +144,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -154,6 +155,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -164,6 +166,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ca357acdb2..317b9b543b 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -160,6 +160,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -170,6 +171,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -180,6 +182,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2w.hpp b/unit_test/blas/Test_Blas1_nrm2w.hpp new file mode 100644 index 0000000000..cda59c83e4 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w.hpp @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + expected_result = + Kokkos::ArithTraits::sqrt(expected_result); + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + expected_result[j] = + Kokkos::ArithTraits::sqrt(expected_result[j]); + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_float"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_double"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); + test_nrm2w, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_complex_double"); + test_nrm2w_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_int"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_nrm2w_squared.hpp b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp new file mode 100644 index 0000000000..14f1c90766 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w_squared(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_squared_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w_squared(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w_squared() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_squared_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_float"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_double"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_complex_double"); + test_nrm2w_squared, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); + test_nrm2w_squared_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_int"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index 768091885c..2b7f51370e 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -133,6 +133,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -143,6 +144,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -153,6 +155,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif