diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 3cabcd0f73..54cc124474 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -45,6 +45,7 @@ #include #include #include +#include #ifndef _SPILUKHANDLE_HPP #define _SPILUKHANDLE_HPP @@ -87,6 +88,12 @@ class SPILUKHandle { typedef typename Kokkos::View nnz_lno_view_t; + typedef typename Kokkos::View + nnz_row_view_host_t; + + typedef typename Kokkos::View + nnz_lno_view_host_t; + typedef typename std::make_signed< typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; typedef Kokkos::View signed_nnz_lno_view_t; + typedef Kokkos::View + work_view_t; + private: nnz_row_view_t level_list; // level IDs which the rows belong to nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level - nnz_lno_view_t level_nchunks; // number of chunks of rows at each level - nnz_lno_view_t + nnz_lno_view_host_t level_nchunks; // number of chunks of rows at each level + nnz_lno_view_host_t level_nrowsperchunk; // maximum number of rows among chunks at each level + work_view_t iw; // working view for mapping dense indices to sparse indices size_type nrows; size_type nlevels; @@ -128,6 +140,7 @@ class SPILUKHandle { level_ptr(), level_nchunks(), level_nrowsperchunk(), + iw(), nrows(nrows_), nlevels(0), nnzL(nnzL_), @@ -147,11 +160,12 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), - reset_symbolic_complete(); + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_host_t(), + level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), + iw = work_view_t(); } virtual ~SPILUKHandle(){}; @@ -170,17 +184,28 @@ class SPILUKHandle { nnz_lno_view_t get_level_ptr() const { return level_ptr; } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nchunks() const { return level_nchunks; } + nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; } void alloc_level_nchunks(const size_type nlevels_) { - level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_); + level_nchunks = nnz_lno_view_host_t("level_nchunks", nlevels_); } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } + nnz_lno_view_host_t get_level_nrowsperchunk() const { + return level_nrowsperchunk; + } void alloc_level_nrowsperchunk(const size_type nlevels_) { - level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); + level_nrowsperchunk = nnz_lno_view_host_t("level_nrowsperchunk", nlevels_); + } + + KOKKOS_INLINE_FUNCTION + work_view_t get_iw() const { return iw; } + + void alloc_iw(const size_type nrows_, const size_type ncols_) { + iw = work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + nrows_, ncols_); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); } KOKKOS_INLINE_FUNCTION @@ -238,8 +263,7 @@ class SPILUKHandle { if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; - /* - if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { + /*if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { std::cout << "SEQLVLSCHED_TP2" << std::endl;; std::cout << "WARNING: With CUDA this is currently only reliable with int-int ordinal-offset pair" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d0b80ace69..4af8606dfb 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -242,52 +242,54 @@ struct ILUKLvlSchedTP1NumericFunctor { KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = level_idx(my_league + lev_start); - auto my_team = team.team_rank(); + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const size_type k) { - auto col = L_entries(k); - L_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); #else Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = L_entries(k); - L_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); #endif #ifdef KEEP_DIAG - if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0); + // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif team.team_barrier(); - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = U_entries(k); - U_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); team.team_barrier(); // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); + k1 = static_cast(A_row_map(rowid)); + k2 = static_cast(A_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = A_entries(k); - auto ipos = iw(my_league, col); + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); if (col < rowid) L_values(ipos) = A_values(k); else @@ -297,20 +299,22 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { + for (size_type k = k1; k < k2 - 1; k++) #else - for (auto k = k1; k < k2; ++k) { + for (size_type k = k1; k < k2; k++) #endif - auto prev_row = L_entries(k); + { + nnz_lno_t prev_row = L_entries(k); #ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); #else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - if (my_team == 0) L_values(k) = fact; + // if (my_thread == 0) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); @@ -318,10 +322,10 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { - auto col = U_entries(kk); - auto ipos = iw(my_league, col); + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; if (ipos != -1) { - auto lxu = -U_values(kk) * fact; if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); else @@ -332,40 +336,49 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); } // end for k - if (my_team == 0) { + // if (my_thread == 0) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + nnz_lno_t ipos = iw(my_team, rowid); #ifdef KEEP_DIAG - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } #else - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } else { - U_values(iw(my_league, rowid)) = 1.0 / U_values(iw(my_league, rowid)); + U_values(ipos) = 1.0 / U_values(ipos); } #endif - } + }); + //} team.team_barrier(); // Reset - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #else - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #endif - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, U_entries(k)) = -1; }); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; + }); } }; @@ -379,23 +392,17 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, LValuesType &L_values, const URowMapType &U_row_map, const UEntriesType &U_entries, UValuesType &U_values) { using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = - Kokkos::View>; - using LevelHostViewType = Kokkos::View; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; size_type nlevels = thandle.get_num_levels(); - size_type nrows = thandle.get_nrows(); // Keep these as host View, create device version and copy back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType level_nrowsperchunk = - thandle.get_level_nrowsperchunk(); + HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM @@ -409,25 +416,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); + //{ if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = LevelHostViewType( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), - level_nchunks.extent(0)); - level_nrowsperchunk_h = - LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, - "Host level nrowsperchunk"), - level_nrowsperchunk.extent(0)); - Kokkos::deep_copy(level_nchunks_h, level_nchunks); - Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrowsperchunk(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); - } else { - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrows(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); } + iw = thandle.get_iw(); // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -476,49 +471,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nrows_chunk, team_size), tstf); - + Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } } - // /* - // // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed else if ( thandle.get_algorithm() == - // KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { - // typedef Kokkos::TeamPolicy tvt_policy_type; - // - // int team_size = thandle.get_team_size(); - // if ( team_size == -1 ) { - // team_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 1 : 128; - // } - // int vector_size = thandle.get_team_size(); - // if ( vector_size == -1 ) { - // vector_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 1 : 4; - // } - // - // // This impl: "chunk" lvl_nodes into node_groups; a league_rank - // is responsible for processing that many nodes - // // TeamThreadRange over number of node_groups - // // To avoid masking threads, 1 thread (team) per node in - // node_group - // // ThreadVectorRange responsible for the actual solve - // computation const int node_groups = team_size; - // - // LowerTriLvlSchedTP2SolverFunctor - // tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - // row_count, node_groups); - // Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( - // (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, - // vector_size ), tstf); - // } // end elseif - // */ - } // end if } // end for lvl + //} // Output check #ifdef NUMERIC_OUTPUT_INFO @@ -526,7 +485,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; std::cout << " L_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << L_row_map(i) << " "; } std::cout << std::endl; @@ -545,7 +504,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; std::cout << " U_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << U_row_map(i) << " "; } std::cout << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 90bb88e057..691d624963 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -121,15 +121,15 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // SEQLVLSCHD_TP1 algorithm (chunks) template -void level_sched(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, - LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, - size_type& nlevels) { + class LevelType1, class LevelType2, class size_type> +void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_view_host_t = typename IlukHandle::nnz_lno_view_host_t; size_type nrows = thandle.get_nrows(); @@ -168,11 +168,10 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, level_ptr(0) = 0; // Find max rows, number of chunks, max rows of chunks across levels - using HostViewType = - Kokkos::View; - - HostViewType lnchunks("lnchunks", nlevels); - HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); + thandle.alloc_level_nchunks(nlevels); + thandle.alloc_level_nrowsperchunk(nlevels); + nnz_lno_view_host_t lnchunks = thandle.get_level_nchunks(); + nnz_lno_view_host_t lnrowsperchunk = thandle.get_level_nrowsperchunk(); #ifdef KOKKOS_ENABLE_CUDA using memory_space = typename IlukHandle::memory_space; @@ -214,9 +213,6 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - - level_nchunks = lnchunks; - level_nrowsperchunk = lnrowsperchunk; } // Linear Search for the smallest row index @@ -326,7 +322,6 @@ void iluk_symbolic(IlukHandle& thandle, HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); HostTmpViewType h_llev("h_llev", nrows); - HostTmpViewType level_nchunks, level_nrowsperchunk; size_type cntL = 0; size_type cntU = 0; @@ -472,19 +467,13 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, level_nchunks, level_nrowsperchunk, nlev); - - thandle.alloc_level_nchunks(nlev); - thandle.alloc_level_nrowsperchunk(nlev); - HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType dlevel_nrowsperchunk = - thandle.get_level_nrowsperchunk(); - Kokkos::deep_copy(dlevel_nchunks, level_nchunks); - Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk); + level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); + thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); + thandle.alloc_iw(thandle.get_level_maxrows(), nrows); } Kokkos::deep_copy(dlevel_ptr, level_ptr);