diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CutDrop.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CutDrop.hpp index 1bb2fa1b1648..087a3de61020 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CutDrop.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CutDrop.hpp @@ -423,37 +423,6 @@ class ScaledDistanceLaplacianComparison { } }; -template -KOKKOS_INLINE_FUNCTION void serialHeapSort(view_type& v, comparator_type comparator) { - auto N = v.extent(0); - size_t start = N / 2; - size_t end = N; - while (end > 1) { - if (start > 0) - start = start - 1; - else { - end = end - 1; - auto temp = v(0); - v(0) = v(end); - v(end) = temp; - } - size_t root = start; - while (2 * root + 1 < end) { - size_t child = 2 * root + 1; - if ((child + 1 < end) and (comparator(v(child), v(child + 1)))) - ++child; - - if (comparator(v(root), v(child))) { - auto temp = v(root); - v(root) = v(child); - v(child) = temp; - root = child; - } else - break; - } - } -} - /*! @class CutDropFunctor @brief Order each row by a criterion, compare the ratio of values and drop all entries once the ratio is below the threshold. @@ -499,7 +468,7 @@ class CutDropFunctor { for (size_t i = 0; i < nnz; ++i) { row_permutation(i) = i; } - serialHeapSort(row_permutation, comparator); + Misc::serialHeapSort(row_permutation, comparator); size_t keepStart = 0; size_t dropStart = nnz; diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_DroppingCommon.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_DroppingCommon.hpp index dd371c124fcd..1b45ff16d18d 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_DroppingCommon.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_DroppingCommon.hpp @@ -21,7 +21,7 @@ namespace MueLu { Once we are done with dropping, we should have no UNDECIDED entries left. Normally, both DROP and BOUNDARY entries will be dropped, but we distinguish them in case we want to keep boundaries. */ -enum DecisionType { +enum DecisionType : char { UNDECIDED = 0, // no decision has been taken yet, used for initialization KEEP = 1, // keeep the entry DROP = 2, // drop it @@ -402,6 +402,37 @@ class SymmetrizeFunctor { } }; +template +KOKKOS_INLINE_FUNCTION void serialHeapSort(view_type& v, comparator_type comparator) { + auto N = v.extent(0); + size_t start = N / 2; + size_t end = N; + while (end > 1) { + if (start > 0) + start = start - 1; + else { + end = end - 1; + auto temp = v(0); + v(0) = v(end); + v(end) = temp; + } + size_t root = start; + while (2 * root + 1 < end) { + size_t child = 2 * root + 1; + if ((child + 1 < end) and (comparator(v(child), v(child + 1)))) + ++child; + + if (comparator(v(root), v(child))) { + auto temp = v(root); + v(root) = v(child); + v(child) = temp; + root = child; + } else + break; + } + } +} + } // namespace Misc } // namespace MueLu diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_MatrixConstruction.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_MatrixConstruction.hpp index 1a5f2729c72e..ead5b407e2f8 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_MatrixConstruction.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_MatrixConstruction.hpp @@ -360,6 +360,55 @@ class PointwiseFillNoReuseFunctor { } }; +template +class BlockRowComparison { + public: + using local_ordinal_type = typename local_matrix_type::ordinal_type; + using memory_space = typename local_matrix_type::memory_space; + using block_indices_view_type = Kokkos::View; + + local_matrix_type A; + local_ordinal_type bsize; + block_indices_view_type ghosted_point_to_block; + + public: + BlockRowComparison(local_matrix_type& A_, local_ordinal_type bsize_, block_indices_view_type ghosted_point_to_block_) + : A(A_) + , bsize(bsize_) + , ghosted_point_to_block(ghosted_point_to_block_) {} + + template + struct Comparator { + private: + using local_ordinal_type = typename local_matrix_type2::ordinal_type; + using memory_space = typename local_matrix_type2::memory_space; + using block_indices_view_type = Kokkos::View; + + const local_matrix_type2 A; + const local_ordinal_type offset; + const block_indices_view_type ghosted_point_to_block; + + public: + KOKKOS_INLINE_FUNCTION + Comparator(const local_matrix_type2& A_, local_ordinal_type bsize_, local_ordinal_type brlid_, block_indices_view_type ghosted_point_to_block_) + : A(A_) + , offset(A_.graph.row_map(bsize_ * brlid_)) + , ghosted_point_to_block(ghosted_point_to_block_) {} + + KOKKOS_INLINE_FUNCTION + bool operator()(size_t x, size_t y) const { + return ghosted_point_to_block(A.graph.entries(offset + x)) < ghosted_point_to_block(A.graph.entries(offset + y)); + } + }; + + using comparator_type = Comparator; + + KOKKOS_INLINE_FUNCTION + comparator_type getComparator(local_ordinal_type brlid) const { + return comparator_type(A, bsize, brlid, ghosted_point_to_block); + } +}; + /*! @class VectorCountingFunctor @brief Functor that executes a sequence of sub-functors on each block of rows. @@ -380,6 +429,7 @@ class VectorCountingFunctor { using memory_space = typename local_matrix_type::memory_space; using results_view = Kokkos::View; using block_indices_view_type = Kokkos::View; + using permutation_type = Kokkos::View; using rowptr_type = typename local_matrix_type::row_map_type::non_const_type; using ATS = Kokkos::ArithTraits; @@ -392,6 +442,10 @@ class VectorCountingFunctor { rowptr_type graph_rowptr; functor_type functor; + + BlockRowComparison comparison; + permutation_type permutation; + VectorCountingFunctor remainingFunctors; std::vector functorNames; @@ -405,7 +459,9 @@ class VectorCountingFunctor { , filtered_rowptr(filtered_rowptr_) , graph_rowptr(graph_rowptr_) , functor(functor_) + , comparison(BlockRowComparison(A, blockSize_, ghosted_point_to_block)) , remainingFunctors(A_, blockSize_, ghosted_point_to_block_, results_, filtered_rowptr_, graph_rowptr_, remainingFunctors_...) { + permutation = permutation_type("permutation", A.nnz()); #ifdef MUELU_COALESCE_DROP_DEBUG std::string mangledFunctorName = typeid(decltype(functor)).name(); int status = 0; @@ -495,40 +551,41 @@ class VectorCountingFunctor { Kokkos::printf("Done with block row %d\nGraph indices ", brlid); #endif - local_ordinal_type* nextIndices = new local_ordinal_type[blockSize]; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - nextIndices[block_index] = 0; - } + // column lids for all rows in the block + auto block_clids = Kokkos::subview(A.graph.entries, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + // set up a permutatation index + auto block_permutation = Kokkos::subview(permutation, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + for (size_t i = 0; i < block_permutation.extent(0); ++i) + block_permutation(i) = i; + // get permuatation for sorted column indices of the entire block + auto comparator = comparison.getComparator(brlid); + Misc::serialHeapSort(block_permutation, comparator); + local_ordinal_type prev_bclid = -1; - while (true) { - local_ordinal_type min_block_index = -1; - local_ordinal_type min_clid = ATS::max(); - local_ordinal_type min_offset = -1; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - auto rlid = blockSize * brlid + block_index; - auto offset = A.graph.row_map(rlid) + nextIndices[block_index]; - if (offset == A.graph.row_map(rlid + 1)) - continue; - auto clid = A.graph.entries(offset); - if (clid < min_clid) { - min_block_index = block_index; - min_clid = clid; - min_offset = offset; - } - } - if (min_block_index == -1) - break; - ++nextIndices[min_block_index]; - auto bclid = ghosted_point_to_block(min_clid); - if (prev_bclid < bclid) { - if (results(min_offset) == KEEP) { - ++(*nnz_graph); + bool alreadyAdded = false; + + // loop over all sorted entries in block + auto offset = A.graph.row_map(blockSize * brlid); + for (size_t i = 0; i < block_permutation.extent(0); ++i) { + auto idx = offset + block_permutation(i); + auto clid = A.graph.entries(idx); + auto bclid = ghosted_point_to_block(clid); + + // unseen block column index + if (bclid > prev_bclid) + alreadyAdded = false; + + // add entry to graph + if (!alreadyAdded && (results(idx) == KEEP)) { + ++(*nnz_graph); + alreadyAdded = true; #ifdef MUELU_COALESCE_DROP_DEBUG - Kokkos::printf("%5d ", bclid); + Kokkos::printf("%5d ", bclid); #endif - prev_bclid = bclid; - } } + prev_bclid = bclid; } #ifdef MUELU_COALESCE_DROP_DEBUG Kokkos::printf("\n"); @@ -547,6 +604,7 @@ class VectorCountingFunctor { using memory_space = typename local_matrix_type::memory_space; using results_view = Kokkos::View; using block_indices_view_type = Kokkos::View; + using permutation_type = Kokkos::View; using rowptr_type = typename local_matrix_type::row_map_type::non_const_type; using ATS = Kokkos::ArithTraits; @@ -563,6 +621,9 @@ class VectorCountingFunctor { std::vector functorNames; + BlockRowComparison comparison; + permutation_type permutation; + public: VectorCountingFunctor(local_matrix_type& A_, local_ordinal_type blockSize_, block_indices_view_type ghosted_point_to_block_, results_view& results_, rowptr_type& filtered_rowptr_, rowptr_type& graph_rowptr_, functor_type& functor_) : A(A_) @@ -571,7 +632,9 @@ class VectorCountingFunctor { , results(results_) , filtered_rowptr(filtered_rowptr_) , graph_rowptr(graph_rowptr_) - , functor(functor_) { + , functor(functor_) + , comparison(BlockRowComparison(A, blockSize_, ghosted_point_to_block)) { + permutation = permutation_type("permutation", A.nnz()); #ifdef MUELU_COALESCE_DROP_DEBUG std::string mangledFunctorName = typeid(decltype(functor)).name(); int status = 0; @@ -659,40 +722,41 @@ class VectorCountingFunctor { Kokkos::printf("Done with block row %d\nGraph indices ", brlid); #endif - local_ordinal_type* nextIndices = new local_ordinal_type[blockSize]; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - nextIndices[block_index] = 0; - } + // column lids for all rows in the block + auto block_clids = Kokkos::subview(A.graph.entries, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + // set up a permutation index + auto block_permutation = Kokkos::subview(permutation, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + for (size_t i = 0; i < block_permutation.extent(0); ++i) + block_permutation(i) = i; + // get permutation for sorted column indices of the entire block + auto comparator = comparison.getComparator(brlid); + Misc::serialHeapSort(block_permutation, comparator); + local_ordinal_type prev_bclid = -1; - while (true) { - local_ordinal_type min_block_index = -1; - local_ordinal_type min_clid = ATS::max(); - local_ordinal_type min_offset = -1; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - auto rlid = blockSize * brlid + block_index; - auto offset = A.graph.row_map(rlid) + nextIndices[block_index]; - if (offset == A.graph.row_map(rlid + 1)) - continue; - auto clid = A.graph.entries(offset); - if (clid < min_clid) { - min_block_index = block_index; - min_clid = clid; - min_offset = offset; - } - } - if (min_block_index == -1) - break; - ++nextIndices[min_block_index]; - auto bclid = ghosted_point_to_block(min_clid); - if (prev_bclid < bclid) { - if (results(min_offset) == KEEP) { - ++(*nnz_graph); + bool alreadyAdded = false; + + // loop over all sorted entries in block + auto offset = A.graph.row_map(blockSize * brlid); + for (size_t i = 0; i < block_permutation.extent(0); ++i) { + auto idx = offset + block_permutation(i); + auto clid = A.graph.entries(idx); + auto bclid = ghosted_point_to_block(clid); + + // unseen block column index + if (bclid > prev_bclid) + alreadyAdded = false; + + // add entry to graph + if (!alreadyAdded && (results(idx) == KEEP)) { + ++(*nnz_graph); + alreadyAdded = true; #ifdef MUELU_COALESCE_DROP_DEBUG - Kokkos::printf("%5d ", bclid); + Kokkos::printf("%5d ", bclid); #endif - prev_bclid = bclid; - } } + prev_bclid = bclid; } #ifdef MUELU_COALESCE_DROP_DEBUG Kokkos::printf("\n"); @@ -720,6 +784,7 @@ class VectorFillFunctor { using ATS = Kokkos::ArithTraits; using OTS = Kokkos::ArithTraits; using block_indices_view_type = Kokkos::View; + using permutation_type = Kokkos::View; local_matrix_type A; local_ordinal_type blockSize; @@ -729,6 +794,9 @@ class VectorFillFunctor { local_graph_type graph; const scalar_type zero = ATS::zero(); + BlockRowComparison comparison; + permutation_type permutation; + public: VectorFillFunctor(local_matrix_type& A_, local_ordinal_type blockSize_, block_indices_view_type ghosted_point_to_block_, results_view& results_, local_matrix_type& filteredA_, local_graph_type& graph_) : A(A_) @@ -736,7 +804,10 @@ class VectorFillFunctor { , ghosted_point_to_block(ghosted_point_to_block_) , results(results_) , filteredA(filteredA_) - , graph(graph_) {} + , graph(graph_) + , comparison(BlockRowComparison(A, blockSize_, ghosted_point_to_block)) { + permutation = permutation_type("permutation", A.nnz()); + } KOKKOS_INLINE_FUNCTION void operator()(const local_ordinal_type brlid) const { @@ -776,40 +847,40 @@ class VectorFillFunctor { } } - local_ordinal_type* nextIndices = new local_ordinal_type[blockSize]; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - nextIndices[block_index] = 0; - } - local_ordinal_type prev_bclid = -1; + // column lids for all rows in the block + auto block_clids = Kokkos::subview(A.graph.entries, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + // set up a permuatation index + auto block_permutation = Kokkos::subview(permutation, Kokkos::make_pair(A.graph.row_map(blockSize * brlid), + A.graph.row_map(blockSize * (brlid + 1)))); + for (size_t i = 0; i < block_permutation.extent(0); ++i) + block_permutation(i) = i; + // get permutation for sorted column indices of the entire block + auto comparator = comparison.getComparator(brlid); + Misc::serialHeapSort(block_permutation, comparator); - local_ordinal_type j = graph.row_map(brlid); - while (true) { - local_ordinal_type min_block_index = -1; - local_ordinal_type min_clid = OTS::max(); - local_ordinal_type min_offset = -1; - for (local_ordinal_type block_index = 0; block_index < blockSize; ++block_index) { - auto rlid = blockSize * brlid + block_index; - auto offset = A.graph.row_map(rlid) + nextIndices[block_index]; - if (offset == A.graph.row_map(rlid + 1)) - continue; - auto clid = A.graph.entries(offset); - if (clid < min_clid) { - min_block_index = block_index; - min_clid = clid; - min_offset = offset; - } - } - if (min_block_index == -1) - break; - ++nextIndices[min_block_index]; - auto bclid = ghosted_point_to_block(min_clid); - if (prev_bclid < bclid) { - if (results(min_offset) == KEEP) { - graph.entries(j) = bclid; - ++j; - prev_bclid = bclid; - } + local_ordinal_type prev_bclid = -1; + bool alreadyAdded = false; + local_ordinal_type j = graph.row_map(brlid); + + // loop over all sorted entries in block + auto offset = A.graph.row_map(blockSize * brlid); + for (size_t i = 0; i < block_permutation.extent(0); ++i) { + auto idx = offset + block_permutation(i); + auto clid = A.graph.entries(idx); + auto bclid = ghosted_point_to_block(clid); + + // unseen block column index + if (bclid > prev_bclid) + alreadyAdded = false; + + // add entry to graph + if (!alreadyAdded && (results(idx) == KEEP)) { + graph.entries(j) = bclid; + ++j; + alreadyAdded = true; } + prev_bclid = bclid; } } };