From 5930a2b92d425ce54870fc69631de6d66069d112 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Mon, 22 Aug 2022 22:57:08 -0400 Subject: [PATCH] Adding my weighted graph coarsening code into kokkos-kernels (#1043) * adding my wgted graph coarsening code into kokkos-kernels * graph coarsening unit tests * updating coarse builder with recent changes * updating with recent changes * fixing some details * more tests; small fixes * fix some problems * fix some things; unit tests work now * change contact * fixed for pthreads and when openmp/serial are not enabled * enable spgemm dedupe type in unit test * clang format * clang format again * converting all member functions to static; using a handle instead of member variable parameters * clang-format * requested changes * exclude new coarsening files from compilation for cuda builds without lambdas enabled * update files after rebase * clang format * fix g++ compiler warnings for serial build * clang format * 'correct' variable was not needed and has been eliminated * fix compiler warning for SKX gcc-7 build * fix randomly failing coarsening test for small random graphs --- .../KokkosKernels_HashmapAccumulator.hpp | 115 +- src/graph/KokkosGraph_CoarsenConstruct.hpp | 1919 +++++++++++++++++ src/graph/KokkosGraph_CoarsenHeuristics.hpp | 1191 ++++++++++ unit_test/graph/Test_Graph.hpp | 1 + unit_test/graph/Test_Graph_coarsen.hpp | 481 +++++ 5 files changed, 3705 insertions(+), 2 deletions(-) create mode 100644 src/graph/KokkosGraph_CoarsenConstruct.hpp create mode 100644 src/graph/KokkosGraph_CoarsenHeuristics.hpp create mode 100644 unit_test/graph/Test_Graph_coarsen.hpp diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index c6397fd9ea..c988e2f4f7 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -46,8 +46,6 @@ #include #include -//#define HASHMAPACCUMULATOR_ASSERT_ENABLED - namespace KokkosKernels { namespace Experimental { @@ -442,6 +440,77 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA because warps do not go in SIMD fashion + // anymore. while some thread might insert my_write_index into linked + // list, another thread in the warp might be reading keys in above loop. + // before inserting the new value in liked list -- which is done with + // atomic exchange below, we make sure that the linked is is complete my + // assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, because no two keys will be inserted into + // hashmap at the same time, as rows have unique columns. + + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. 
If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // just like vector_atomic_insert_into_hash_mergeAdd_TrackHashes + // except uses atomic addition on updating the value + // necessary if duplicate key insertions happen simultaneously + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + const key_type key, const value_type value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + i = hash_begins[hash]; + + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + Kokkos::atomic_add(values + i, value); + return __insert_success; + } + } + } else { + return __insert_success; + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + #if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ defined(KOKKOS_ARCH_AMPERE) // this is an issue on VOLTA and up because warps do not go in SIMD @@ -480,6 +549,48 @@ struct HashmapAccumulator { } } + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list( + const key_type key, const value_type value, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + // loop until an empty hash is found and the key insertion succeeds + // if our capacity is at least some constant multiple of the current used + // hashes then the expected number of iterations is constant + int depth = 0; + // add key to hash to ensure no two keys follow the same paths over hashes + // add depth to prevent cycles + for (;; hash = __compute_hash(hash + key + depth++, __hashOpRHS)) { + if (keys[hash] == key) { + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } else if (keys[hash] == -1) { + if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, + key)) { + // should only be here if we used a new hash + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, + size_type(1))] = hash; + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } + // we don't care if we failed if some other thread succeeded with the + // same key as ours + if (keys[hash] == key) { + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } + } + } + } else { + return __insert_success; + } + } + // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 template diff --git a/src/graph/KokkosGraph_CoarsenConstruct.hpp b/src/graph/KokkosGraph_CoarsenConstruct.hpp new file mode 100644 index 0000000000..250f7873de --- /dev/null +++ b/src/graph/KokkosGraph_CoarsenConstruct.hpp @@ -0,0 +1,1919 
@@ +#pragma once +// exclude from Cuda builds without lambdas enabled +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#include +#include +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosKernels_HashmapAccumulator.hpp" +#include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" +#include "KokkosGraph_CoarsenHeuristics.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +struct SortLowDegreeCrsMatrixFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + using value_type = lno_t; + + SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, + const entries_t& _entries, + const values_t& _values, + const lno_t _degreeLimit) + : rowmap(_rowmap), + entries(_entries), + values(_values), + degreeLimit(_degreeLimit) { + if (usingRangePol) { + entriesAux = + entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), + entries.extent(0)); + valuesAux = + values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), + values.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, + value_type& reducer) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + if (rowNum > degreeLimit) { + reducer++; + return; + } + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort2( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, + valuesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, + value_type& reducer) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + if (rowNum > degreeLimit) { + Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); + return; + } + KokkosKernels::TeamBitonicSort2( + entries.data() + rowStart, values.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_t entriesAux; + values_t values; + values_t valuesAux; + lno_t degreeLimit; +}; + +} // namespace Impl + +// Sort a CRS matrix: within each row, sort entries ascending by column. +// At the same time, permute the values. +// Only modifies rows below the degreeLimit +template +typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( + const rowmap_t& rowmap, const entries_t& entries, const values_t& values, + const typename entries_t::non_const_value_type degreeLimit) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + Impl::SortLowDegreeCrsMatrixFunctor + funct(useRadix, rowmap, entries, values, degreeLimit); + lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; + lno_t notSorted = 0; + if (useRadix) { + Kokkos::parallel_reduce("sort_crs_matrix", + Kokkos::RangePolicy(0, numRows), + funct, notSorted); + } else { + // Try to get teamsize to be largest power of 2 not greater than avg entries + // per row + // TODO (probably important for performnce): add thread-level sort also, and + // use that for small avg degree. But this works for now. probably important + // for this particular use case of only low-degree rows + int teamSize = 1; + lno_t avgDeg = 0; + if (numRows) avgDeg = (entries.extent(0) + numRows - 1) / numRows; + if (avgDeg > degreeLimit) { + avgDeg = degreeLimit; + } + while (teamSize * 2 * 2 <= avgDeg) { + teamSize *= 2; + } + team_pol temp(numRows, teamSize); + teamSize = std::min(teamSize, + temp.team_size_max(funct, Kokkos::ParallelReduceTag())); + Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), + funct, notSorted); + } + return notSorted; +} + +} // namespace KokkosSparse + +namespace KokkosGraph { + +namespace Experimental { + +// this class is not meant to be instantiated +// think of it like a templated namespace +template +class coarse_builder { + public: + // define internal types + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = Kokkos::View; + using wgt_view_t = Kokkos::View; + using edge_view_t = Kokkos::View; + using edge_subview_t = Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = Kokkos::RangePolicy; + using dyn_policy_t = + Kokkos::RangePolicy, exec_space>; + using team_policy_t = Kokkos::TeamPolicy; + using dyn_team_policy_t = + Kokkos::TeamPolicy, exec_space>; + using member = typename team_policy_t::member_type; + using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle< + edge_offset_t, ordinal_t, scalar_t, exec_space, mem_space, mem_space>; + using uniform_memory_pool_t = + KokkosKernels::Impl::UniformMemoryPool; + using mapper_t = coarsen_heuristics; + static constexpr ordinal_t get_null_val() { + // this value must line up with the null value used by the hashmap + // accumulator + if (std::is_signed::value) { + return -1; + } else { + return std::numeric_limits::max(); + } + } + static constexpr ordinal_t ORD_MAX = get_null_val(); + static constexpr bool is_host_space = std::is_same< + typename exec_space::memory_space, + typename Kokkos::DefaultHostExecutionSpace::memory_space>::value; + static constexpr bool scal_eq_ord = std::is_same::value; + // contains matrix and vertex weights corresponding to current level + // interp matrix maps previous level to this level + struct coarse_level_triple { + matrix_t mtx; + vtx_view_t vtx_wgts; + matrix_t interp_mtx; + int level; + bool uniform_weights; + }; + + // define behavior-controlling enums + enum Heuristic { HECv1, Match, MtMetis, MIS2, GOSHv1, GOSHv2 }; + enum Builder { Sort, Hashmap, Hybrid, Spgemm, Spgemm_transpose_first }; + + struct coarsen_handle { + // internal parameters and data + // default heuristic is HEC + Heuristic h = HECv1; + // default builder is Hybrid + Builder b = Hybrid; + std::list results; + ordinal_t coarse_vtx_cutoff = 50; + ordinal_t min_allowed_vtx = 10; + unsigned int max_levels = 200; + size_t max_mem_allowed = 
536870912; + }; + + // determine if dynamic scheduling should be used + static bool should_use_dyn( + const ordinal_t n, const Kokkos::View work, + int t_count) { + bool use_dyn = false; + edge_offset_t max = 0; + edge_offset_t min = std::numeric_limits::max(); + if (is_host_space) { + ordinal_t static_size = (n + t_count) / t_count; + for (ordinal_t i = 0; i < t_count; i++) { + ordinal_t start = i * static_size; + ordinal_t end = start + static_size; + if (start > n) start = n; + if (end > n) end = n; + edge_offset_t size = work(end) - work(start); + if (size > max) { + max = size; + } + if (size < min) { + min = size; + } + } + if (n > 500000 && max > 5 * min) { + use_dyn = true; + } + } + return use_dyn; + } + + // build the course graph according to ((B^T A) B) or (B^T (A B)), where B is + // aggregator matrix + static coarse_level_triple build_coarse_graph_spgemm( + coarsen_handle& handle, const coarse_level_triple level, + const matrix_t interp_mtx) { + vtx_view_t f_vtx_w = level.vtx_wgts; + matrix_t g = level.mtx; + + ordinal_t n = g.numRows(); + ordinal_t nc = interp_mtx.numCols(); + + matrix_t interp_transpose = + KokkosSparse::Impl::transpose_matrix(interp_mtx); + + spgemm_kernel_handle kh; + kh.set_team_work_size(64); + kh.set_dynamic_scheduling(true); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::SPGEMM_KK_MEMORY; + kh.create_spgemm_handle(spgemm_algorithm); + + vtx_view_t adj_coarse; + wgt_view_t wgt_coarse; + edge_view_t row_map_coarse; + + if (handle.b == Spgemm_transpose_first) { + edge_view_t row_map_p1("rows_partial", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, g.graph.row_map, + g.graph.entries, false, row_map_p1); + + // partial-result matrix + vtx_view_t entries_p1("adjacencies_partial", + kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", + kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, interp_transpose.values, false, + g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, + entries_p1, values_p1); + + row_map_coarse = edge_view_t("rows_coarse", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, nc, row_map_p1, entries_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_coarse); + // coarse-graph adjacency matrix + adj_coarse = + vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = + wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_coarse, adj_coarse, wgt_coarse); + } else { + edge_view_t row_map_p1("rows_partial", n + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, n, n, nc, g.graph.row_map, g.graph.entries, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_p1); + + // partial-result matrix + vtx_view_t entries_p1("adjacencies_partial", + kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", + kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, 
row_map_p1, entries_p1, values_p1); + + row_map_coarse = edge_view_t("rows_coarse", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, + row_map_coarse); + // coarse-graph adjacency matrix + adj_coarse = + vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = + wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, interp_transpose.values, false, + row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, + wgt_coarse); + } + + // now we must remove self-loop edges + edge_view_t nonLoops("nonLoop", nc); + + // gonna reuse this to count non-self loop edges + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t i) { nonLoops(i) = 0; }); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t u) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); + j++) { + if (adj_coarse(j) != u) { + nonLoops(u)++; + } + } + }); + + edge_view_t row_map_nonloop("nonloop row map", nc + 1); + + Kokkos::parallel_scan( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, + const bool final) { + const edge_offset_t val_i = nonLoops(i); + update += val_i; + if (final) { + row_map_nonloop(i + 1) = update; + } + }); + + edge_subview_t rmn_subview = Kokkos::subview(row_map_nonloop, nc); + edge_offset_t rmn = 0; + Kokkos::deep_copy(rmn, rmn_subview); + + vtx_view_t entries_nonloop("nonloop entries", rmn); + wgt_view_t values_nonloop("nonloop values", rmn); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i) { nonLoops(i) = 0; }); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t u) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); + j++) { + if (adj_coarse(j) != u) { + edge_offset_t offset = row_map_nonloop(u) + nonLoops(u)++; + entries_nonloop(offset) = adj_coarse(j); + values_nonloop(offset) = wgt_coarse(j); + } + } + }); + // done removing self-loop edges + + kh.destroy_spgemm_handle(); + + graph_type gc_graph(entries_nonloop, row_map_nonloop); + matrix_t gc("gc", nc, values_nonloop, gc_graph); + + vtx_view_t c_vtx_w("coarse vtx weights", interp_mtx.numCols()); + Kokkos::parallel_for( + "compute coarse vtx wgts", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = interp_mtx.graph.entries(i); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + }); + + coarse_level_triple next_level; + next_level.mtx = gc; + next_level.vtx_wgts = c_vtx_w; + next_level.level = level.level + 1; + next_level.interp_mtx = interp_mtx; + next_level.uniform_weights = false; + return next_level; + } + + struct prefix_sum { + vtx_view_t input; + edge_view_t output; + + prefix_sum(vtx_view_t _input, edge_view_t _output) + : input(_input), output(_output) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t i, edge_offset_t& update, + const bool final) const { + const edge_offset_t val_i = input(i); + update += val_i; + if (final) { + output(i + 1) = update; + } + } + }; + + struct functorDedupeLowDegreeAfterSort { + // compiler may get confused what the reduction type is without this + typedef edge_offset_t value_type; + + edge_view_t row_map; + vtx_view_t entries, entriesOut; + wgt_view_t wgts, wgtsOut; + vtx_view_t dedupe_edge_count; + ordinal_t degreeLimit; + + 
functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, + vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, + vtx_view_t _dedupe_edge_count, + ordinal_t _degreeLimit_) + : row_map(_row_map), + entries(_entries), + entriesOut(_entriesOut), + wgts(_wgts), + wgtsOut(_wgtsOut), + dedupe_edge_count(_dedupe_edge_count), + degreeLimit(_degreeLimit_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, edge_offset_t& thread_sum) const { + ordinal_t u = thread.league_rank(); + edge_offset_t start = row_map(u); + edge_offset_t end = row_map(u + 1); + ordinal_t degree = end - start; + if (degree > degreeLimit) { + return; + } + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), + [&](const edge_offset_t& i) { + entries(i) = entriesOut(i); + wgts(i) = wgtsOut(i); + }); + Kokkos::single(Kokkos::PerTeam(thread), + [&]() { thread_sum += dedupe_edge_count(u); }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& u, edge_offset_t& thread_sum) const { + ordinal_t offset = row_map(u); + ordinal_t last = ORD_MAX; + ordinal_t degree = row_map(u + 1) - row_map(u); + if (degree > degreeLimit) { + return; + } + for (edge_offset_t i = row_map(u); i < row_map(u + 1); i++) { + if (last != entries(i)) { + entriesOut(offset) = entries(i); + wgtsOut(offset) = wgts(i); + last = entries(offset); + offset++; + } else { + wgtsOut(offset - 1) += wgts(i); + } + } + dedupe_edge_count(u) = offset - row_map(u); + thread_sum += offset - row_map(u); + } + }; + + struct functorDedupeAfterSort { + // compiler may get confused what the reduction type is without this + typedef edge_offset_t value_type; + + edge_view_t row_map; + vtx_view_t entries, entriesOut; + wgt_view_t wgts, wgtsOut; + vtx_view_t dedupe_edge_count; + + functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, + vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count) + : row_map(_row_map), + entries(_entries), + entriesOut(_entriesOut), + wgts(_wgts), + wgtsOut(_wgtsOut), + dedupe_edge_count(_dedupe_edge_count) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, edge_offset_t& thread_sum) const { + ordinal_t u = thread.league_rank(); + edge_offset_t start = row_map(u); + edge_offset_t end = row_map(u + 1); + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::single(Kokkos::PerTeam(thread), + [&]() { thread_sum += dedupe_edge_count(u); }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& u, edge_offset_t& thread_sum) const { + ordinal_t offset = row_map(u); + ordinal_t 
last = ORD_MAX; + for (edge_offset_t i = row_map(u); i < row_map(u + 1); i++) { + if (last != entries(i)) { + entries(offset) = entries(i); + wgtsOut(offset) = wgts(i); + last = entries(offset); + offset++; + } else { + wgtsOut(offset - 1) += wgts(i); + } + } + dedupe_edge_count(u) = offset - row_map(u); + thread_sum += offset - row_map(u); + } + }; + + struct functorCollapseDirectedToUndirected { + const edge_view_t source_row_map; + const edge_view_t target_row_map; + const vtx_view_t source_edge_counts; + vtx_view_t target_edge_counts; + const vtx_view_t source_destinations; + vtx_view_t target_destinations; + const wgt_view_t source_wgts; + wgt_view_t target_wgts; + + functorCollapseDirectedToUndirected( + const edge_view_t _source_row_map, const edge_view_t _target_row_map, + const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, + const vtx_view_t _source_destinations, vtx_view_t _target_destinations, + const wgt_view_t _source_wgts, wgt_view_t _target_wgts) + : source_row_map(_source_row_map), + target_row_map(_target_row_map), + source_edge_counts(_source_edge_counts), + target_edge_counts(_target_edge_counts), + source_destinations(_source_destinations), + target_destinations(_target_destinations), + source_wgts(_source_wgts), + target_wgts(_target_wgts) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread) const { + ordinal_t u = thread.league_rank(); + edge_offset_t u_origin = source_row_map(u); + edge_offset_t u_dest_offset = target_row_map(u); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, source_edge_counts(u)), + [=](const edge_offset_t u_idx) { + ordinal_t v = source_destinations(u_origin + u_idx); + scalar_t wgt = source_wgts(u_origin + u_idx); + edge_offset_t v_dest_offset = target_row_map(v); + edge_offset_t v_dest = + v_dest_offset + + Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); + edge_offset_t u_dest = + u_dest_offset + + Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); + + target_destinations(u_dest) = v; + target_wgts(u_dest) = wgt; + target_destinations(v_dest) = u; + target_wgts(v_dest) = wgt; + }); + } + }; + + struct functorHashmapAccumulator { + // compiler may get confused what the reduction type is without this + typedef ordinal_t value_type; + edge_view_t row_map; + vtx_view_t entries_in, entries_out; + wgt_view_t wgts_in, wgts_out; + vtx_view_t dedupe_edge_count; + uniform_memory_pool_t memory_pool; + const ordinal_t hash_size; + const ordinal_t max_hash_entries; + vtx_view_t remaining; + bool use_out; + + functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, + vtx_view_t _entries_out, wgt_view_t _wgts_in, + wgt_view_t _wgts_out, + vtx_view_t _dedupe_edge_count, + uniform_memory_pool_t _memory_pool, + const ordinal_t _hash_size, + const ordinal_t _max_hash_entries, + vtx_view_t _remaining, bool _use_out) + : row_map(_row_map), + entries_in(_entries_in), + entries_out(_entries_out), + wgts_in(_wgts_in), + wgts_out(_wgts_out), + dedupe_edge_count(_dedupe_edge_count), + memory_pool(_memory_pool), + hash_size(_hash_size), + max_hash_entries(_max_hash_entries), + remaining(_remaining), + use_out(_use_out) {} + + KOKKOS_INLINE_FUNCTION + ordinal_t get_thread_id(const ordinal_t row_index) const { +#if defined(KOKKOS_ENABLE_SERIAL) + if (std::is_same::value) return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same::value) + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + if (std::is_same::value) + return 
Kokkos::Threads::impl_hardware_thread_id(); +#endif + return row_index; + } + + // reduces to find total number of rows that were too large + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& rem_idx, ordinal_t& thread_sum) const { + ordinal_t idx = remaining(rem_idx); + typedef ordinal_t hash_size_type; + typedef ordinal_t hash_key_type; + typedef scalar_t hash_value_type; + + // can't do this row at current hashmap size + ordinal_t hash_entries = row_map(idx + 1) - row_map(idx); + if (hash_entries >= max_hash_entries) { + thread_sum++; + return; + } + volatile ordinal_t* ptr_temp = nullptr; + ordinal_t t_id = rem_idx; + // need to use the hardware thread id if the pool type is + // OneThread2OneChunk + t_id = get_thread_id(t_id); + while (nullptr == ptr_temp) { + ptr_temp = (volatile ordinal_t*)(memory_pool.allocate_chunk(t_id)); + } + if (ptr_temp == nullptr) { + return; + } + ordinal_t* ptr_memory_pool_chunk = (ordinal_t*)(ptr_temp); + + // These are updated by Hashmap_Accumulator insert functions. + ordinal_t* used_hash_size = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* used_hash_count = (ordinal_t*)(ptr_temp); + ptr_temp++; + *used_hash_size = 0; + *used_hash_count = 0; + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + ordinal_t hash_func_pow2 = hash_size - 1; + + // Set pointer to hash indices + ordinal_t* used_hash_indices = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash begins + ordinal_t* hash_begins = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash nexts + ordinal_t* hash_nexts = (ordinal_t*)(ptr_temp); + + // Set pointer to hash keys + ordinal_t* keys = (ordinal_t*)entries_out.data() + row_map(idx); + + // Set pointer to hash values + scalar_t* values = (scalar_t*)wgts_out.data() + row_map(idx); + + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, + values); + + for (edge_offset_t i = row_map(idx); i < row_map(idx + 1); i++) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, used_hash_indices); + }; + + // Reset the Begins values to -1 before releasing the memory pool chunk. + // If you don't do this the next thread that grabs this memory chunk will + // not work properly. 
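+      // Note: only the buckets recorded in used_hash_indices are cleared, so
+      // this cleanup costs O(used_hash_count) rather than O(hash_size).
+      // ORD_MAX is used as the empty marker so that it matches the null value
+      // expected by the HashmapAccumulator (see get_null_val above).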
+ for (ordinal_t i = 0; i < *used_hash_count; i++) { + ordinal_t dirty_hash = used_hash_indices[i]; + + hash_map.hash_begins[dirty_hash] = ORD_MAX; + }; + + // used_hash_size gives the number of entries, used_hash_count gives the + // number of dirty hash values (I think) + dedupe_edge_count(idx) = *used_hash_size; + // Release the memory pool chunk back to the pool + memory_pool.release_chunk(ptr_memory_pool_chunk); + + } // operator() + + // reduces to find total number of rows that were too large + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, ordinal_t& thread_sum) const { + ordinal_t idx = remaining(thread.league_rank()); + typedef ordinal_t hash_size_type; + typedef ordinal_t hash_key_type; + typedef scalar_t hash_value_type; + + // can't do this row at current hashmap size + ordinal_t hash_entries = row_map(idx + 1) - row_map(idx); + if (hash_entries >= max_hash_entries) { + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum++; }); + thread.team_barrier(); + return; + } + volatile ordinal_t* ptr_temp = nullptr; + Kokkos::single( + Kokkos::PerTeam(thread), + [=](volatile ordinal_t*& ptr_write) { + // Acquire a chunk from the memory pool using a spin-loop. + ptr_write = nullptr; + while (nullptr == ptr_write) { + ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk( + thread.league_rank())); + } + }, + ptr_temp); + thread.team_barrier(); + if (ptr_temp == nullptr) { + return; + } + ordinal_t* ptr_memory_pool_chunk = (ordinal_t*)(ptr_temp); + + // These are updated by Hashmap_Accumulator insert functions. + ordinal_t* used_hash_size = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* used_hash_count = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* write_idx = (ordinal_t*)(ptr_temp); + ptr_temp++; + Kokkos::single(Kokkos::PerTeam(thread), [&]() { + *used_hash_size = 0; + *used_hash_count = 0; + *write_idx = 0; + }); + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + ordinal_t hash_func_pow2 = hash_size - 1; + + // Set pointer to hash indices + ordinal_t* used_hash_indices = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash begins + ordinal_t* hash_begins = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash nexts + ordinal_t* hash_nexts = (ordinal_t*)(ptr_temp); + ptr_temp += max_hash_entries; + + // Set pointer to hash keys + ordinal_t* keys = (ordinal_t*)(ptr_temp); + ptr_temp += max_hash_entries; + + // Set pointer to hash values + scalar_t* values; + if (use_out) { + values = (scalar_t*)wgts_out.data() + row_map(idx); + } else { + values = (scalar_t*)(ptr_temp); + } + + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, + values); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), + [&](const edge_offset_t& i) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + // duplicate keys may be inserted simultaneously, this causes + // problems we must handle later + int r = + hash_map + .vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, + used_hash_indices); + + // Check return code + if (r) { + } + }); + thread.team_barrier(); + + // Reset the Begins values to -1 before releasing the memory pool chunk. 
+ // If you don't do this the next thread that grabs this memory chunk will + // not work properly. Also merges values inside each linked list, because + // there can be duplicate key insertions (these are hopefully rare or else + // performance will suffer) This did not work as a TeamThreadRange, don't + // know why (possibly issues with atomic addition on write_idx) + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), + [&](const ordinal_t& i) { + ordinal_t dirty_hash = used_hash_indices[i]; + + ordinal_t bucket = hash_begins[dirty_hash]; + + // ascending-key bubble-sort the linked list + // it really do be like that sometimes + ordinal_t end_inner = ORD_MAX; + while (end_inner != bucket) { + ordinal_t last_idx = bucket; + ordinal_t last_key = keys[last_idx]; + scalar_t last_val = values[last_idx]; + bool is_sorted = true; + // bubble-up + for (ordinal_t k = hash_nexts[bucket]; k != end_inner; + k = hash_nexts[k]) { + // swap + if (keys[k] < last_key) { + keys[last_idx] = keys[k]; + values[last_idx] = values[k]; + keys[k] = last_key; + values[k] = last_val; + is_sorted = false; + } + // increment last + last_key = keys[k]; + last_val = values[k]; + last_idx = k; + } + end_inner = last_idx; + if (is_sorted) { + // end the outer loop + end_inner = bucket; + } + } + ordinal_t key = keys[bucket]; + scalar_t val = values[bucket]; + ordinal_t last = bucket; + // merge linked list and write out + for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; + j = hash_nexts[j]) { + if (keys[j] == key) { + val += values[j]; + } else { + ordinal_t write_at = + row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + key = keys[j]; + val = values[j]; + } + hash_nexts[last] = ORD_MAX; + last = j; + } + hash_nexts[last] = ORD_MAX; + // write out the final entry in linked list + ordinal_t write_at = + row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + hash_begins[dirty_hash] = ORD_MAX; + }); + thread.team_barrier(); + // need to copy from wgts_in to wgts_out if we used wgts_in as scratch + // space + if (use_out) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), + [&](const ordinal_t& i) { + wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); + }); + } + + Kokkos::single(Kokkos::PerTeam(thread), [&]() { + // used_hash_size gives the number of entries, used_hash_count gives the + // number of dirty hash values (I think) + dedupe_edge_count(idx) = *write_idx; + // Release the memory pool chunk back to the pool + memory_pool.release_chunk(ptr_memory_pool_chunk); + }); + + } // operator() + + }; // functorHashmapAccumulator + + static void getHashmapSizeAndCount( + coarsen_handle& handle, const ordinal_t n, + const ordinal_t remaining_count, vtx_view_t remaining, + vtx_view_t edges_per_source, ordinal_t& hash_size, ordinal_t& max_entries, + ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { + ordinal_t avg_entries = 0; + if (!is_host_space && + static_cast(remaining_count) / static_cast(n) > 0.01) { + Kokkos::parallel_reduce( + "calc average among remaining", policy_t(0, remaining_count), 
+ KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = remaining(i); + ordinal_t degree = edges_per_source(u); + thread_sum += degree; + }, + avg_entries); + // degrees are often skewed so we want to err on the side of bigger + // hashmaps + avg_entries = avg_entries * 2 / remaining_count; + avg_entries++; + if (avg_entries < 50) avg_entries = 50; + } else { + Kokkos::parallel_reduce( + "calc max", policy_t(0, remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_max) { + ordinal_t u = remaining(i); + ordinal_t degree = edges_per_source(u); + if (degree > thread_max) { + thread_max = degree; + } + }, + Kokkos::Max(avg_entries)); + // need precisely one larger than max, don't remember why atm + avg_entries++; + } + + // Set the hash_size as the next power of 2 bigger than hash_size_hint. + // - hash_size must be a power of two since we use & rather than % (which is + // slower) for computing the hash value for HashmapAccumulator. + max_entries = avg_entries; + hash_size = 1; + while (hash_size < max_entries) { + hash_size *= 2; + } + + // Determine memory chunk size for UniformMemoryPool + mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash begins + mem_chunk_size += + 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t + // != ordinal_t, in which case memory is unused) + mem_chunk_size += 10; // for metadata + mem_chunk_count = exec_space::concurrency(); + if (mem_chunk_count > remaining_count) { + mem_chunk_count = remaining_count + 1; + } + + if (!is_host_space) { + // decrease number of mem_chunks to reduce memory usage if necessary + size_t mem_needed = static_cast(mem_chunk_count) * + static_cast(mem_chunk_size) * + sizeof(ordinal_t); + //~500MB + size_t max_mem_allowed = handle.max_mem_allowed; + if (mem_needed > max_mem_allowed) { + size_t chunk_dif = mem_needed - max_mem_allowed; + chunk_dif = chunk_dif / + (static_cast(mem_chunk_size) * sizeof(ordinal_t)); + chunk_dif++; + mem_chunk_count -= chunk_dif; + } + } + } + + static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, + const bool use_team, + vtx_view_t edges_per_source, + vtx_view_t dest_by_source, + wgt_view_t wgt_by_source, + const edge_view_t source_bucket_offset, + edge_offset_t& gc_nedges) { + if (handle.b == Hashmap || is_host_space) { + ordinal_t remaining_count = n; + vtx_view_t remaining("remaining vtx", n); + Kokkos::parallel_for( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); + // deduplicate rows in phases starting with the small degree rows so we + // can use small hashmaps increase the hashmap size each phase to the + // necessary size for twice the average of remaining rows + wgt_view_t wgt_out = wgt_by_source; + if (!scal_eq_ord && !is_host_space) { + // only necessary if teams might be used + wgt_out = wgt_view_t("wgts out", wgt_by_source.extent(0)); + } + do { + // determine size for hashmap + ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; + getHashmapSizeAndCount(handle, n, remaining_count, remaining, + edges_per_source, hash_size, max_entries, + mem_chunk_size, mem_chunk_count); + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::ManyThread2OneChunk; + + if (is_host_space) { + pool_type = KokkosKernels::Impl::OneThread2OneChunk; + } + + bool use_dyn = should_use_dyn(n, source_bucket_offset, mem_chunk_count); + + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, + ORD_MAX, 
pool_type); + + functorHashmapAccumulator hashmapAccumulator( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, memory_pool, hash_size, max_entries, + remaining, !scal_eq_ord); + + ordinal_t old_remaining_count = remaining_count; + if (!is_host_space && max_entries >= 128) { + Kokkos::parallel_reduce("hashmap time", + team_policy_t(old_remaining_count, 1, 64), + hashmapAccumulator, remaining_count); + } else { + if (use_dyn) { + Kokkos::parallel_reduce( + "hashmap time", + dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), + hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", + policy_t(0, old_remaining_count), + hashmapAccumulator, remaining_count); + } + } + + if (remaining_count > 0) { + vtx_view_t new_remaining("new remaining vtx", remaining_count); + + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, old_remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = remaining(i); + if (edges_per_source(u) >= max_entries) { + if (final) { + new_remaining(update) = u; + } + update++; + } + }); + + remaining = new_remaining; + } + } while (remaining_count > 0); + Kokkos::parallel_reduce( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += edges_per_source(i); + }, + gc_nedges); + if (!scal_eq_ord && !is_host_space) { + Kokkos::deep_copy(wgt_by_source, wgt_out); + } + } else if (handle.b == Sort) { + // sort the (implicit) crs matrix + KokkosSparse::sort_crs_matrix(source_bucket_offset, + dest_by_source, wgt_by_source); + + // combine adjacent entries that are equal + if (use_team) { + // thread team version + wgt_view_t wgts_out("wgts after dedupe", wgt_by_source.extent(0)); + vtx_view_t dest_out("dest after dedupe", dest_by_source.extent(0)); + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, + dest_out, wgt_by_source, wgts_out, + edges_per_source); + Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), + deduper, gc_nedges); + Kokkos::deep_copy(wgt_by_source, wgts_out); + Kokkos::deep_copy(dest_by_source, dest_out); + } else { + // no thread team version + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, + dest_by_source, wgt_by_source, + wgt_by_source, edges_per_source); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, + gc_nedges); + } + + } else if (handle.b == Hybrid) { + wgt_view_t wgt_out = wgt_by_source; + if (!scal_eq_ord && !is_host_space) { + // only necessary if teams might be used + wgt_out = wgt_view_t("wgts out", wgt_by_source.extent(0)); + } + ordinal_t limit = 128; + // sort the (implicit) crs matrix, but only the low degree rows + ordinal_t remaining_count = + KokkosSparse::sort_low_degree_rows_crs_matrix( + source_bucket_offset, dest_by_source, wgt_by_source, limit); + // combine adjacent entries that are equal + { + // no thread team version + functorDedupeLowDegreeAfterSort deduper( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, limit); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, + gc_nedges); + } + vtx_view_t remaining("remaining vtx", remaining_count); + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + if (edges_per_source(i) > limit) { + if (final) { + remaining(update) = i; + } + update++; + } + }); + // deduplicate 
rows in phases starting with the small degree rows so we + // can use small hashmaps increase the hashmap size each phase to the + // necessary size for twice the average of remaining rows + while (remaining_count > 0) { + // determine size for hashmap + ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; + getHashmapSizeAndCount(handle, n, remaining_count, remaining, + edges_per_source, hash_size, max_entries, + mem_chunk_size, mem_chunk_count); + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::ManyThread2OneChunk; + + if (is_host_space) { + pool_type = KokkosKernels::Impl::OneThread2OneChunk; + } + + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, + ORD_MAX, pool_type); + + functorHashmapAccumulator hashmapAccumulator( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, memory_pool, hash_size, max_entries, + remaining, !scal_eq_ord); + + ordinal_t old_remaining_count = remaining_count; + if (!is_host_space && max_entries >= 128) { + Kokkos::parallel_reduce("hashmap time", + dyn_team_policy_t(old_remaining_count, 1, 64), + hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", + dyn_policy_t(0, old_remaining_count), + hashmapAccumulator, remaining_count); + } + + if (remaining_count > 0) { + vtx_view_t new_remaining("new remaining vtx", remaining_count); + + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, old_remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = remaining(i); + if (edges_per_source(u) >= max_entries) { + if (final) { + new_remaining(update) = u; + } + update++; + } + }); + + remaining = new_remaining; + } + } + gc_nedges = 0; + Kokkos::parallel_reduce( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += edges_per_source(i); + }, + gc_nedges); + if (!scal_eq_ord && !is_host_space) { + Kokkos::deep_copy(wgt_by_source, wgt_out); + } + } + } + + struct translationFunctor { + matrix_t vcmap, g; + vtx_view_t mapped_edges, edges_per_source; + edge_view_t source_bucket_offset; + vtx_view_t edges_out; + wgt_view_t wgts_out; + ordinal_t workLength; + + translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, + vtx_view_t _edges_per_source, + edge_view_t _source_bucket_offset, vtx_view_t _edges_out, + wgt_view_t _wgts_out) + : vcmap(_vcmap), + g(_g), + mapped_edges(_mapped_edges), + edges_per_source(_edges_per_source), + source_bucket_offset(_source_bucket_offset), + edges_out(_edges_out), + wgts_out(_wgts_out), + workLength(_g.numRows()) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& t) const { + ordinal_t i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= workLength) return; + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this, inefficient + edge_offset_t offset = Kokkos::atomic_fetch_add( + &edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& i) const { + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = 
g.graph.row_map(i + 1); + for (edge_offset_t idx = start; idx < end; idx++) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + } + } + }; + + // optimized for regular distribution low degree rows + static coarse_level_triple build_nonskew(coarsen_handle& handle, + const matrix_t g, + const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t edges_per_source) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + edge_offset_t gc_nedges = 0; + + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + + Kokkos::deep_copy(edges_per_source, static_cast(0)); + + edge_subview_t sbo_subview = Kokkos::subview(source_bucket_offset, nc); + edge_offset_t nnz_pre_dedupe = 0; + Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); + + vtx_view_t dest_by_source("dest_by_source", nnz_pre_dedupe); + wgt_view_t wgt_by_source("wgt_by_source", nnz_pre_dedupe); + + // translates fine entries into coarse entries and writes into coarse rows + translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, + source_bucket_offset, dest_by_source, + wgt_by_source); + if (is_host_space) { + bool use_dyn = + should_use_dyn(n, g.graph.row_map, exec_space::concurrency()); + if (use_dyn) { + Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), + translateF); + } else { + Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), + translateF); + } + } else { + auto execSpaceEnum = + KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( + n, g.nnz(), execSpaceEnum); + team_policy_t dummy(1, 1, vectorLength); + int teamSize = dummy.team_size_max(translateF, Kokkos::ParallelForTag()); + Kokkos::parallel_for( + "move edges to coarse matrix", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), + translateF); + } + + deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + edge_view_t source_offsets("source_offsets", nc + 1); + + Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), + prefix_sum(edges_per_source, source_offsets)); + + edge_subview_t edge_total_subview = Kokkos::subview(source_offsets, nc); + Kokkos::deep_copy(gc_nedges, edge_total_subview); + + vtx_view_t dest_idx("dest_idx", gc_nedges); + wgt_view_t wgts("wgts", gc_nedges); + + if (is_host_space) { + bool use_dyn = + should_use_dyn(nc, source_offsets, exec_space::concurrency()); + if (use_dyn) { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t& u) { + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { + dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + } + }); + } else { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t& u) { + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { + 
dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + } + }); + } + } else { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", + team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, edges_per_source(u)), + [=](const ordinal_t idx) { + dest_idx(start_dest + idx) = + dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + }); + }); + } + + graph_type gc_graph(dest_idx, source_offsets); + matrix_t gc("gc", nc, wgts, gc_graph); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + // forms the explicit matrix created by symmetrizing the implicit matrix + static matrix_t collapse_directed_to_undirected( + const ordinal_t nc, const vtx_view_t source_edge_counts, + const edge_view_t source_row_map, const vtx_view_t source_destinations, + const wgt_view_t source_wgts) { + vtx_view_t coarse_degree("coarse degree", nc); + Kokkos::deep_copy(coarse_degree, source_edge_counts); + + Kokkos::parallel_for( + "count directed edges owned by opposite endpoint", + team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + edge_offset_t start = source_row_map(u); + edge_offset_t end = start + source_edge_counts(u); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = source_destinations(idx); + // increment other vertex + Kokkos::atomic_fetch_add(&coarse_degree(v), 1); + }); + }); + + edge_view_t target_row_map("target row map", nc + 1); + + Kokkos::parallel_scan("calc target row map", policy_t(0, nc), + prefix_sum(coarse_degree, target_row_map)); + + Kokkos::deep_copy(coarse_degree, static_cast(0)); + + edge_offset_t coarse_edges_total = 0; + edge_subview_t coarse_edge_total_subview = + Kokkos::subview(target_row_map, nc); + Kokkos::deep_copy(coarse_edges_total, coarse_edge_total_subview); + + vtx_view_t dest_idx("dest_idx", coarse_edges_total); + wgt_view_t wgts("wgts", coarse_edges_total); + + Kokkos::parallel_for( + "move edges into correct size matrix", team_policy_t(nc, Kokkos::AUTO), + functorCollapseDirectedToUndirected( + source_row_map, target_row_map, source_edge_counts, coarse_degree, + source_destinations, dest_idx, source_wgts, wgts)); + + graph_type gc_graph(dest_idx, target_row_map); + matrix_t gc("gc", nc, wgts, gc_graph); + return gc; + } + + // optimized for skewed degree distributions + static coarse_level_triple build_skew(coarsen_handle& handle, + const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t degree_initial) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_offset_t gc_nedges = 0; + + vtx_view_t edges_per_source("edges_per_source", nc); + + // recount with edges only belonging to coarse vertex of smaller degree + // matrix becomes directed + Kokkos::parallel_for( + "recount edges", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + 
Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); + }); + }); + + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + edge_subview_t sbo_subview = Kokkos::subview(source_bucket_offset, nc); + edge_offset_t nnz_pre_dedupe = 0; + Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); + + Kokkos::deep_copy(edges_per_source, static_cast(0)); + vtx_view_t dest_by_source("dest by source", nnz_pre_dedupe); + wgt_view_t wgt_by_source("wgt by source", nnz_pre_dedupe); + Kokkos::parallel_for( + "combine fine rows", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + dest_by_source(offset) = v; + wgt_by_source(offset) = g.values(idx); + } + }); + }); + gc_nedges = 0; + + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + // form the final coarse graph, which requires symmetrizing the matrix + matrix_t gc = collapse_directed_to_undirected( + nc, edges_per_source, source_bucket_offset, dest_by_source, + wgt_by_source); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + // optimized for very large row sizes caused by lots of duplicate entries + // first translates each fine entry to its coarse vertex label + // deduplicates within each fine row + // combines fine rows into coarse rows + // deduplicates within each coarse row + static coarse_level_triple build_high_duplicity(coarsen_handle& handle, + const matrix_t g, + const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t degree_initial) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_offset_t gc_nedges = 0; + + vtx_view_t dedupe_count("dedupe count", n); + edge_view_t row_map_copy("row map copy", n + 1); + + // recount fine row sizes with edges only belonging to fine vertex of coarse + // vertex of smaller degree matrix becomes directed + Kokkos::parallel_for( + "recount edges", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = 
mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + dedupe_count(outer_idx) = nonLoopEdgesTotal; + }); + }); + + Kokkos::parallel_scan("calc source offsets", policy_t(0, n), + prefix_sum(dedupe_count, row_map_copy)); + // reset counters to 0 + Kokkos::deep_copy(dedupe_count, static_cast(0)); + + edge_subview_t fine_recount_subview = Kokkos::subview(row_map_copy, n); + edge_offset_t fine_recount = 0; + Kokkos::deep_copy(fine_recount, fine_recount_subview); + + vtx_view_t dest_fine("fine to coarse dests", fine_recount); + wgt_view_t wgt_fine("fine to coarse wgts", fine_recount); + + // create a new directed version of the fine matrix + Kokkos::parallel_for( + "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); + + offset += row_map_copy(outer_idx); + + dest_fine(offset) = v; + wgt_fine(offset) = g.values(idx); + } + }); + }); + //"delete" these views + Kokkos::resize(mapped_edges, 0); + + // deduplicate coarse adjacencies within each fine row + deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, + row_map_copy, gc_nedges); + + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + vtx_view_t edges_per_source("edges_per_source", nc); + + Kokkos::parallel_for( + "sum fine row sizes", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = vcmap.graph.entries(i); + Kokkos::atomic_fetch_add(&edges_per_source(u), dedupe_count(i)); + }); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::deep_copy(edges_per_source, static_cast(0)); + vtx_view_t dest_by_source("dest by source", gc_nedges); + wgt_view_t wgt_by_source("wgt by source", gc_nedges); + Kokkos::parallel_for( + "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = row_map_copy(outer_idx); + edge_offset_t end = start + dedupe_count(outer_idx); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = dest_fine(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + dest_by_source(offset) = v; + wgt_by_source(offset) = wgt_fine(idx); + } + }); + }); + gc_nedges = 0; + Kokkos::resize(dest_fine, 0); + Kokkos::resize(wgt_fine, 0); + + deduplicate_graph(handle, nc, true, 
edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + // form the final coarse graph, which requires symmetrizing the matrix + matrix_t gc = collapse_directed_to_undirected( + nc, edges_per_source, source_bucket_offset, dest_by_source, + wgt_by_source); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + struct countingFunctor { + // counts adjancies for each coarse vertex + // also calculates coarse vertex wgts + matrix_t vcmap, g; + vtx_view_t mapped_edges, degree_initial; + vtx_view_t c_vtx_w, f_vtx_w; + ordinal_t workLength; + + countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, + vtx_view_t _degree_initial, vtx_view_t _c_vtx_w, + vtx_view_t _f_vtx_w) + : vcmap(_vcmap), + g(_g), + mapped_edges(_mapped_edges), + degree_initial(_degree_initial), + c_vtx_w(_c_vtx_w), + f_vtx_w(_f_vtx_w), + workLength(_g.numRows()) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& t) const { + ordinal_t i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= workLength) return; + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(t, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = vcmap.graph.entries(g.graph.entries(idx)); + mapped_edges(idx) = v; + if (u != v) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerThread(t), [=]() { + Kokkos::atomic_add(°ree_initial(u), nonLoopEdgesTotal); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& i) const { + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + ordinal_t nonLoopEdgesTotal = 0; + for (edge_offset_t idx = start; idx < end; idx++) { + ordinal_t v = vcmap.graph.entries(g.graph.entries(idx)); + mapped_edges(idx) = v; + if (u != v) { + nonLoopEdgesTotal++; + } + } + Kokkos::atomic_add(°ree_initial(u), nonLoopEdgesTotal); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + } + }; + + static coarse_level_triple build_coarse_graph(coarsen_handle& handle, + const coarse_level_triple level, + const matrix_t vcmap) { + if (handle.b == Spgemm || handle.b == Spgemm_transpose_first) { + return build_coarse_graph_spgemm(handle, level, vcmap); + } + + matrix_t g = level.mtx; + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + + vtx_view_t mapped_edges("mapped edges", g.nnz()); + + vtx_view_t degree_initial("edges_per_source", nc); + vtx_view_t f_vtx_w = level.vtx_wgts; + vtx_view_t c_vtx_w = vtx_view_t("coarse vertex weights", nc); + + // count non-self loop edges per coarse vertex + // also computes coarse vertex weights + countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, + f_vtx_w); + if (is_host_space) { + Kokkos::parallel_for( + "count edges per coarse vertex (also compute coarse vertex weights)", + policy_t(0, n), countF); + } else { + auto execSpaceEnum = + KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( + n, g.nnz(), execSpaceEnum); + team_policy_t dummy(1, 1, vectorLength); + int teamSize = dummy.team_size_max(countF, Kokkos::ParallelForTag()); + // count edges per vertex + Kokkos::parallel_for( + "count edges per coarse vertex (also compute coarse vertex weights)", + 
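+          // Launch roughly one thread per fine row: ceil(n / teamSize) teams
+          // of teamSize threads each; countF recovers the row index as
+          // league_rank * team_size + team_rank and vectorizes over the row
+          // entries with ThreadVectorRange.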
team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), + countF); + } + + // compute max row size and avg row size + // use this to determine most efficient method for building coarse graph + // (for load balance primarily) + edge_offset_t total_unduped = 0; + ordinal_t max_unduped = 0; + Kokkos::parallel_reduce( + "find max", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& l_max) { + if (l_max <= degree_initial(i)) { + l_max = degree_initial(i); + } + }, + Kokkos::Max(max_unduped)); + Kokkos::parallel_reduce( + "find total", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += degree_initial(i); + }, + total_unduped); + ordinal_t avg_unduped = total_unduped / nc; + + coarse_level_triple next_level; + // optimized subroutines for sufficiently irregular graphs or high average + // adjacency rows don't do optimizations if running on CPU (the default host + // space) + if (avg_unduped > (nc / 4) && !is_host_space) { + next_level = + build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); + } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && + !is_host_space) { + next_level = build_skew(handle, g, vcmap, mapped_edges, degree_initial); + } else { + next_level = + build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); + } + + next_level.vtx_wgts = c_vtx_w; + next_level.level = level.level + 1; + next_level.interp_mtx = vcmap; + next_level.uniform_weights = false; + return next_level; + } + + static matrix_t generate_coarse_mapping(coarsen_handle& handle, + const matrix_t g, + bool uniform_weights) { + matrix_t interpolation_graph; + int choice = 0; + + switch (handle.h) { + case Match: choice = 0; break; + case MtMetis: choice = 1; break; + default: choice = 0; + } + + switch (handle.h) { + case HECv1: + interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); + break; + case Match: + case MtMetis: + interpolation_graph = + mapper_t::coarsen_match(g, uniform_weights, choice); + break; + case MIS2: interpolation_graph = mapper_t::coarsen_mis_2(g); break; + case GOSHv2: interpolation_graph = mapper_t::coarsen_GOSH_v2(g); break; + case GOSHv1: interpolation_graph = mapper_t::coarsen_GOSH(g); break; + } + return interpolation_graph; + } + + // we can support weighted vertices pretty easily, but we don't rn + // this function can't return the generated list directly because of an NVCC + // compiler bug caller must use the get_levels() method after calling this + // function + static void generate_coarse_graphs(coarsen_handle& handle, + const matrix_t fine_g, + bool uniform_weights = false) { + ordinal_t fine_n = fine_g.numRows(); + std::list& levels = handle.results; + levels.clear(); + coarse_level_triple finest; + finest.mtx = fine_g; + // 1-indexed, not zero indexed + finest.level = 1; + finest.uniform_weights = uniform_weights; + vtx_view_t vtx_weights("vertex weights", fine_n); + Kokkos::deep_copy(vtx_weights, static_cast(1)); + finest.vtx_wgts = vtx_weights; + levels.push_back(finest); + while (levels.rbegin()->mtx.numRows() > handle.coarse_vtx_cutoff) { + coarse_level_triple current_level = *levels.rbegin(); + + matrix_t interp_graph = generate_coarse_mapping( + handle, current_level.mtx, current_level.uniform_weights); + + if (interp_graph.numCols() < handle.min_allowed_vtx) { + break; + } + + coarse_level_triple next_level = + build_coarse_graph(handle, current_level, interp_graph); + + levels.push_back(next_level); + + if (levels.size() > handle.max_levels) break; + } + } +}; + +} // end 
namespace Experimental +} // end namespace KokkosGraph +// exclude from Cuda builds without lambdas enabled +#endif diff --git a/src/graph/KokkosGraph_CoarsenHeuristics.hpp b/src/graph/KokkosGraph_CoarsenHeuristics.hpp new file mode 100644 index 0000000000..a9a5ce3e21 --- /dev/null +++ b/src/graph/KokkosGraph_CoarsenHeuristics.hpp @@ -0,0 +1,1191 @@ +#pragma once +// exclude from Cuda builds without lambdas enabled +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#include +#include +#include +#include +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosGraph_MIS2.hpp" + +namespace KokkosGraph { + +namespace Experimental { + +template +class coarsen_heuristics { + public: + // define internal types + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = typename Kokkos::View; + using wgt_view_t = typename Kokkos::View; + using edge_view_t = typename Kokkos::View; + using edge_subview_t = typename Kokkos::View; + using rand_view_t = typename Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = typename Kokkos::RangePolicy; + using team_policy_t = typename Kokkos::TeamPolicy; + using member = typename team_policy_t::member_type; + using part_view_t = typename Kokkos::View; + using pool_t = Kokkos::Random_XorShift64_Pool; + using gen_t = typename pool_t::generator_type; + using hasher_t = Kokkos::pod_hash; + static constexpr ordinal_t get_null_val() { + if (std::is_signed::value) { + return -1; + } else { + return std::numeric_limits::max(); + } + } + static constexpr ordinal_t ORD_MAX = get_null_val(); + + static vtx_view_t generate_permutation(ordinal_t n, pool_t rand_pool) { + rand_view_t randoms("randoms", n); + + Kokkos::parallel_for( + "create random entries", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + randoms(i) = generator.urand64(); + rand_pool.free_state(generator); + }); + + int t_buckets = 2 * n; + vtx_view_t buckets("buckets", t_buckets); + Kokkos::parallel_for( + "init buckets", policy_t(0, t_buckets), + KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); + + uint64_t max = std::numeric_limits::max(); + uint64_t bucket_size = max / t_buckets; + Kokkos::parallel_for( + "insert buckets", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t bucket = randoms(i) / bucket_size; + // find the next open bucket + for (;; bucket++) { + if (bucket >= t_buckets) bucket -= t_buckets; + if (buckets(bucket) == ORD_MAX) { + // attempt to insert into bucket + if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), + ORD_MAX, i)) { + break; + } + } + } + }); + + vtx_view_t permute("permutation", n); + Kokkos::parallel_scan( + "extract permutation", policy_t(0, t_buckets), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + if (buckets(i) != ORD_MAX) { + if (final) { + permute(update) = buckets(i); + } + update++; + } + }); + + return permute; + } + + // create a mapping when some vertices are already mapped + // hn is a list of vertices such that vertex i wants to aggregate with vertex + // hn(i) + static ordinal_t parallel_map_construct_prefilled( + vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, 
Kokkos::View nvertices_coarse) { + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + match(i) = ORD_MAX; + } else { + match(i) = n + 1; + } + }); + ordinal_t perm_length = vperm.extent(0); + + // construct mapping using heaviest edges + int swap = 1; + vtx_view_t curr_perm = vperm; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + ordinal_t v = hn(u); + int condition = u < v; + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + if (vcmap(v) != ORD_MAX) { + vcmap(u) = vcmap(v); + } else { + match(u) = ORD_MAX; + } + } + } + } + }); + Kokkos::fence(); + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + if (vcmap(u) == ORD_MAX) { + ordinal_t add_next = Kokkos::atomic_fetch_add(&next_length(), 1); + next_perm(add_next) = u; + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + curr_perm = next_perm; + } + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + return nc; + } + + // hn is a list of vertices such that vertex i wants to aggregate with vertex + // hn(i) + static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, + const vtx_view_t vperm, + const vtx_view_t hn, + const vtx_view_t ordering) { + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); + ordinal_t perm_length = n; + Kokkos::View nvertices_coarse("nvertices"); + + // construct mapping using heaviest edges + int swap = 1; + vtx_view_t curr_perm = vperm; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + ordinal_t v = hn(u); + int condition = ordering(u) < ordering(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + ordinal_t cv = u; + if (v < u) { + cv = v; + } + vcmap(u) = cv; + vcmap(v) = cv; + } else { + if (vcmap(v) != ORD_MAX) { + vcmap(u) = vcmap(v); + } else { + match(u) = ORD_MAX; + } + } + } + } + }); + Kokkos::fence(); + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? 
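+      // Compact the still-unmatched vertices into next_perm with one scan:
+      // on the final pass each unmatched u is written to its packed position
+      // and the last iteration stores the packed count in next_length.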
+ Kokkos::parallel_scan( + policy_t(0, perm_length), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = curr_perm(i); + if (vcmap(u) == ORD_MAX) { + if (final) { + next_perm(update) = u; + } + update++; + } + if (final && (i + 1) == perm_length) { + next_length() = update; + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + curr_perm = next_perm; + } + Kokkos::parallel_scan( + "assign aggregates", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { + if (vcmap(u) == u) { + if (final) { + vcmap(u) = update; + } + update++; + } else if (final) { + vcmap(u) = vcmap(u) + n; + } + if (final && (u + 1) == n) { + nvertices_coarse() = update; + } + }); + Kokkos::parallel_for( + "propagate aggregates", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + if (vcmap(u) >= n) { + ordinal_t c_id = vcmap(u) - n; + vcmap(u) = vcmap(c_id); + } + }); + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + return nc; + } + + static part_view_t GOSH_clusters(const matrix_t& g) { + // finds the central vertices for GOSH clusters + // approximately this is a maximal independent set (if you pretend edges + // whose endpoints both exceed degree thresholds don't exist) IS vertices + // are preferred to be vertices with high degree, so it should be small + + ordinal_t n = g.numRows(); + + // 0: unassigned + // 1: in IS + //-1: adjacent to an IS vertex + part_view_t state("psuedo is membership", n); + + ordinal_t unassigned_total = n; + + // gonna keep this as an edge view in case we wanna do weighted degree + edge_view_t degrees("degrees", n); + vtx_view_t unassigned("unassigned vertices", n); + Kokkos::parallel_for( + "populate degrees", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + degrees(i) = g.graph.row_map(i + 1) - g.graph.row_map(i); + unassigned(i) = i; + }); + edge_offset_t threshold = g.nnz() / g.numRows(); + + while (unassigned_total > 0) { + part_view_t tuple_state("tuple state", n); + edge_view_t tuple_degree("tuple rand", n); + vtx_view_t tuple_idx("tuple index", n); + + part_view_t tuple_state_update("tuple state", n); + edge_view_t tuple_degree_update("tuple rand", n); + vtx_view_t tuple_idx_update("tuple index", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { + tuple_state(i) = state(i); + tuple_degree(i) = degrees(i); + tuple_idx(i) = i; + }); + + Kokkos::parallel_for( + policy_t(0, unassigned_total), KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = unassigned(i); + int max_state = tuple_state(u); + edge_offset_t max_degree = tuple_degree(u); + ordinal_t max_idx = tuple_idx(u); + + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + bool is_max = false; + if (tuple_state(v) > max_state) { + is_max = true; + } else if (tuple_state(v) == max_state) { + if (tuple_degree(v) > max_degree) { + is_max = true; + } else if (tuple_degree(v) == max_degree) { + if (tuple_idx(v) > max_idx) { + is_max = true; + } + } + } + // pretend edges between two vertices exceeding threshold do not + // exist + if (degrees(u) > threshold && degrees(v) > threshold) { + is_max = false; + } + if (is_max) { + max_state = tuple_state(v); + max_degree = tuple_degree(v); + max_idx = tuple_idx(v); + } + } + tuple_state_update(u) = max_state; + tuple_degree_update(u) = max_degree; + tuple_idx_update(u) = max_idx; + }); + + Kokkos::parallel_for( + policy_t(0, unassigned_total), KOKKOS_LAMBDA(const ordinal_t 
i) { + ordinal_t u = unassigned(i); + tuple_state(u) = tuple_state_update(u); + tuple_degree(u) = tuple_degree_update(u); + tuple_idx(u) = tuple_idx_update(u); + }); + + ordinal_t next_unassigned_total = 0; + Kokkos::parallel_reduce( + policy_t(0, unassigned_total), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = unassigned(i); + if (state(u) == 0) { + if (tuple_idx(u) == u) { + state(u) = 1; + } + // check if at least one of neighbors are in the IS or will be + // placed into the IS + else if (tuple_state(u) == 1 || + tuple_idx(tuple_idx(u)) == tuple_idx(u)) { + state(u) = -1; + } + } + if (state(u) == 0) { + thread_sum++; + } + }, + next_unassigned_total); + + vtx_view_t next_unassigned("next unassigned", next_unassigned_total); + Kokkos::parallel_scan( + "create next unassigned", policy_t(0, unassigned_total), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = unassigned(i); + if (state(u) == 0) { + if (final) { + next_unassigned(update) = u; + } + update++; + } + }); + unassigned_total = next_unassigned_total; + unassigned = next_unassigned; + } + return state; + } + + static matrix_t coarsen_mis_2(const matrix_t& g) { + ordinal_t n = g.numRows(); + + typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type + nc = 0; + vtx_view_t vcmap = KokkosGraph::graph_mis2_aggregate< + Device, typename matrix_t::staticcrsgraph_type::row_map_type, + typename matrix_t::staticcrsgraph_type::entries_type, vtx_view_t>( + g.graph.row_map, g.graph.entries, nc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_GOSH(const matrix_t& g) { + ordinal_t n = g.numRows(); + + part_view_t colors = GOSH_clusters(g); + + Kokkos::View nvc("nvertices_coarse"); + vtx_view_t vcmap("vcmap", n); + + int first_color = 1; + + // create aggregates for color 1 + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (colors(i) == first_color) { + vcmap(i) = Kokkos::atomic_fetch_add(&nvc(), 1); + } else { + vcmap(i) = ORD_MAX; + } + }); + + // add unaggregated vertices to aggregate of highest degree neighbor + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (colors(i) != first_color) { + // could use a thread team here + edge_offset_t max_degree = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = + g.graph.row_map(v + 1) - g.graph.row_map(v); + if (colors(v) == first_color && degree > max_degree) { + max_degree = degree; + vcmap(i) = vcmap(v); + } + } + } + }); + + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + 
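+          // one entry per fine vertex: column = aggregate id, weight = 1,
+          // i.e. a piecewise-constant interpolation operator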
}); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_GOSH_v2(const matrix_t& g) { + ordinal_t n = g.numRows(); + + Kokkos::View nvc("nvertices_coarse"); + vtx_view_t vcmap("vcmap", n); + + edge_offset_t threshold_d = g.nnz() / n; + if (threshold_d < 50) { + threshold_d = 50; + } + // create aggregates for large degree vtx + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (g.graph.row_map(i + 1) - g.graph.row_map(i) > threshold_d) { + ordinal_t cv = Kokkos::atomic_fetch_add(&nvc(), 1); + vcmap(i) = cv; + } else { + vcmap(i) = ORD_MAX; + } + }); + + // add vertex to max wgt neighbor's aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + ordinal_t argmax = ORD_MAX; + scalar_t max_w = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + ordinal_t wgt = g.values(j); + if (vcmap(v) != ORD_MAX) { + if (wgt >= max_w) { + max_w = wgt; + argmax = v; + } + } + } + if (argmax != ORD_MAX) { + vcmap(i) = vcmap(argmax); + } + } + }); + + // add vertex to max degree neighbor's aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + ordinal_t argmax = ORD_MAX; + edge_offset_t max_d = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = + g.graph.row_map(v + 1) - g.graph.row_map(v); + if (vcmap(v) != ORD_MAX) { + if (degree >= max_d) { + max_d = degree; + argmax = v; + } + } + } + if (argmax != ORD_MAX) { + vcmap(i) = vcmap(argmax); + } + } + }); + + // add neighbors of each aggregated vertex to aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) != ORD_MAX) { + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + if (vcmap(v) == ORD_MAX) { + vcmap(v) = vcmap(i); + } + } + } + }); + + ordinal_t remaining_total = 0; + + Kokkos::parallel_reduce( + "count remaining", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& sum) { + if (vcmap(i) == ORD_MAX) { + sum++; + } + }, + remaining_total); + + vtx_view_t remaining("remaining vtx", remaining_total); + + Kokkos::parallel_scan( + "count remaining", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + if (vcmap(i) == ORD_MAX) { + if (final) { + remaining(update) = i; + } + update++; + } + }); + + vtx_view_t hn("heaviest neighbors", n); + + pool_t rand_pool(std::time(nullptr)); + + Kokkos::parallel_for( + "fill hn", policy_t(0, remaining_total), + KOKKOS_LAMBDA(ordinal_t r_idx) { + // select heaviest neighbor with ties randomly broken + ordinal_t i = remaining(r_idx); + ordinal_t hn_i = ORD_MAX; + uint64_t max_rand = 0; + scalar_t max_ewt = 0; + + edge_offset_t end_offset = g.graph.row_map(i + 1); + for (edge_offset_t j = g.graph.row_map(i); j < end_offset; j++) { + scalar_t wgt = g.values(j); + ordinal_t v = g.graph.entries(j); + gen_t generator = rand_pool.get_state(); + uint64_t rand = generator.urand64(); + rand_pool.free_state(generator); + bool choose = false; + if (max_ewt < wgt) { + choose = true; + } else if (max_ewt == wgt && max_rand <= rand) { + choose = true; + } + + if (choose) { + max_ewt = wgt; + max_rand = rand; + hn_i = v; + } + } + hn(i) = hn_i; + }); + + ordinal_t nc = + 
parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); + Kokkos::deep_copy(nc, nvc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_HEC(const matrix_t& g, bool uniform_weights) { + ordinal_t n = g.numRows(); + + vtx_view_t hn("heavies", n); + + vtx_view_t vcmap("vcmap", n); + + Kokkos::parallel_for( + "initialize vcmap", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + + pool_t rand_pool(std::time(nullptr)); + + vtx_view_t vperm = generate_permutation(n, rand_pool); + + vtx_view_t reverse_map("reversed", n); + Kokkos::parallel_for( + "construct reverse map", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + + if (uniform_weights) { + // all weights equal at this level so choose heaviest edge randomly + Kokkos::parallel_for( + "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + if (adj_size > 0) { + ordinal_t offset = + g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); + } else { + hn(i) = generator.urand64() % n; + } + rand_pool.free_state(generator); + }); + } else { + Kokkos::parallel_for( + "Heaviest HN", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t i = thread.league_rank(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + if (adj_size > 0) { + edge_offset_t end = g.graph.row_map(i + 1); + typename Kokkos::MaxLoc::value_type argmax{}; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, g.graph.row_map(i), end), + [=](const edge_offset_t idx, + Kokkos::ValLocScalar& local) { + scalar_t wgt = g.values(idx); + if (wgt >= local.val) { + local.val = wgt; + local.loc = idx; + } + }, + Kokkos::MaxLoc(argmax)); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + ordinal_t h = g.graph.entries(argmax.loc); + hn(i) = h; + }); + } else { + gen_t generator = rand_pool.get_state(); + hn(i) = generator.urand64() % n; + rand_pool.free_state(generator); + } + }); + } + ordinal_t nc = 0; + nc = parallel_map_construct(vcmap, n, vperm, hn, reverse_map); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static ordinal_t countInf(vtx_view_t target) { + ordinal_t totalInf = 0; + + Kokkos::parallel_reduce( + policy_t(0, target.extent(0)), + KOKKOS_LAMBDA(ordinal_t i, ordinal_t & thread_sum) { + if (target(i) == ORD_MAX) { + thread_sum++; + } + }, + totalInf); + + return totalInf; + } + + struct MatchByHashSorted { + vtx_view_t vcmap, unmapped; + Kokkos::View 
hashes; + ordinal_t unmapped_total; + Kokkos::View nvertices_coarse; + MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, + Kokkos::View _hashes, + ordinal_t _unmapped_total, + Kokkos::View _nvertices_coarse) + : vcmap(_vcmap), + unmapped(_unmapped), + hashes(_hashes), + unmapped_total(_unmapped_total), + nvertices_coarse(_nvertices_coarse) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t i, ordinal_t& update, + const bool final) const { + ordinal_t u = unmapped(i); + ordinal_t tentative = 0; + if (i == 0) { + tentative = i; + } else if (hashes(i - 1) != hashes(i)) { + tentative = i; + } + + if (tentative > update) { + update = tentative; + } + + if (final) { + // update should contain the index of the first hash that equals + // hash(i), could be i we want to determine if i is an odd offset from + // update + ordinal_t isOddOffset = (i - update) & 1; + // if even (0 counts as even) we match unmapped(i) with unmapped(i+1) if + // hash(i) == hash(i+1) if odd do nothing + if (isOddOffset == 0) { + if (i + 1 < unmapped_total) { + if (hashes(i) == hashes(i + 1)) { + ordinal_t v = unmapped(i + 1); + vcmap(u) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(v) = vcmap(u); + } + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ordinal_t& update, + volatile const ordinal_t& input) const { + if (input > update) update = input; + } + }; + + static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, + int match_choice) { + ordinal_t n = g.numRows(); + + vtx_view_t hn("heavies", n); + + vtx_view_t vcmap("vcmap", n); + + Kokkos::parallel_for( + "initialize vcmap", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + + rand_view_t randoms("randoms", n); + + pool_t rand_pool(std::time(nullptr)); + + vtx_view_t vperm = generate_permutation(n, rand_pool); + + vtx_view_t reverse_map("reversed", n); + Kokkos::parallel_for( + "construct reverse map", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + + if (uniform_weights) { + // all weights equal at this level so choose heaviest edge randomly + Kokkos::parallel_for( + "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + ordinal_t offset = + g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); + rand_pool.free_state(generator); + }); + } else { + Kokkos::parallel_for( + "Heaviest HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t hn_i = g.graph.entries(g.graph.row_map(i)); + scalar_t max_ewt = g.values(g.graph.row_map(i)); + + edge_offset_t end_offset = + g.graph.row_map(i + 1); // +g.edges_per_source[i]; + + for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; + j++) { + if (max_ewt < g.values(j)) { + max_ewt = g.values(j); + hn_i = g.graph.entries(j); + } + } + hn(i) = hn_i; + }); + } + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); + + ordinal_t perm_length = n; + + Kokkos::View nvertices_coarse("nvertices"); + + // construct mapping using heaviest edges + int swap = 1; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + // match vertices with heaviest unmatched edge + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = vperm(i); + ordinal_t v = hn(u); + int condition = reverse_map(u) < 
reverse_map(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + // u == v avoids problems if there is a self-loop edge + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + match(u) = ORD_MAX; + } + } + } + }); + Kokkos::fence(); + + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = vperm(i); + if (vcmap(u) == ORD_MAX) { + ordinal_t h = ORD_MAX; + + if (uniform_weights) { + ordinal_t max_ewt = 0; + // we have to iterate over the edges anyways because we need to + // check if any are unmatched! so instead of randomly choosing a + // heaviest edge, we instead use the reverse permutation order + // as the weight + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // using <= so that zero weight edges may still be chosen + if (max_ewt <= reverse_map(v)) { + max_ewt = reverse_map(v); + h = v; + } + } + } + } else { + scalar_t max_ewt = 0; + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // using <= so that zero weight edges may still be chosen + if (max_ewt <= g.values(j)) { + max_ewt = g.values(j); + h = v; + } + } + } + } + + if (h != ORD_MAX) { + ordinal_t add_next = + Kokkos::atomic_fetch_add(&next_length(), 1); + next_perm(add_next) = u; + hn(u) = h; + } + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + vperm = next_perm; + } + + if (match_choice == 1) { + ordinal_t unmapped = countInf(vcmap); + double unmappedRatio = + static_cast(unmapped) / static_cast(n); + + // leaf matches + if (unmappedRatio > 0.25) { + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + if (vcmap(u) != ORD_MAX) { + ordinal_t lastLeaf = ORD_MAX; + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // must be degree 1 to be a leaf + if (g.graph.row_map(v + 1) - g.graph.row_map(v) == 1) { + if (lastLeaf == ORD_MAX) { + lastLeaf = v; + } else { + vcmap(lastLeaf) = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(v) = vcmap(lastLeaf); + lastLeaf = ORD_MAX; + } + } + } + } + } + }); + } + + unmapped = countInf(vcmap); + unmappedRatio = static_cast(unmapped) / static_cast(n); + + // twin matches + if (unmappedRatio > 0.25) { + vtx_view_t unmappedVtx("unmapped vertices", unmapped); + Kokkos::View hashes("hashes", unmapped); + + Kokkos::View unmappedIdx("unmapped index"); + hasher_t hasher; + // compute digests of adjacency lists + Kokkos::parallel_for( + "create digests", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + if (vcmap(u) == ORD_MAX) { + uint32_t hash = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, g.graph.row_map(u), + g.graph.row_map(u + 1)), + [=](const edge_offset_t j, uint32_t& thread_sum) { + 
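+                      // summing per-neighbor hashes gives an order-independent
+                      // digest, so unmatched rows with identical adjacency
+                      // lists ("twins") get equal hashes and end up adjacent
+                      // after the sort below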
thread_sum += hasher(g.graph.entries(j)); + }, + hash); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); + unmappedVtx(idx) = u; + hashes(idx) = hash; + }); + } + }); + uint32_t max = std::numeric_limits::max(); + typedef Kokkos::BinOp1D > BinOp; + BinOp bin_op(unmapped, 0, max); + // VERY important that final parameter is true + Kokkos::BinSort, BinOp, exec_space, + ordinal_t> + sorter(hashes, bin_op, true); + sorter.create_permute_vector(); + sorter.template sort >(hashes); + sorter.template sort(unmappedVtx); + + MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, + nvertices_coarse); + Kokkos::parallel_scan("match twins", policy_t(0, unmapped), + matchTwinFunctor); + } + + unmapped = countInf(vcmap); + unmappedRatio = static_cast(unmapped) / static_cast(n); + + // relative matches + if (unmappedRatio > 0.25) { + // get possibly mappable vertices of unmapped + vtx_view_t mappableVtx("mappable vertices", unmapped); + Kokkos::parallel_scan( + "get unmapped", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + if (vcmap(i) == ORD_MAX) { + if (final) { + mappableVtx(update) = i; + } + + update++; + } + }); + + ordinal_t mappable_count = unmapped; + do { + Kokkos::parallel_for( + "reset hn", policy_t(0, mappable_count), + KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = mappableVtx(i); + hn(u) = ORD_MAX; + }); + + // choose relatives for unmapped vertices + Kokkos::parallel_for( + "assign relatives", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) != ORD_MAX) { + ordinal_t last_free = ORD_MAX; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + if (vcmap(v) == ORD_MAX) { + if (last_free != ORD_MAX) { + // there can be multiple threads updating this but it + // doesn't matter as long as they have some value + hn(last_free) = v; + hn(v) = last_free; + last_free = ORD_MAX; + } else { + last_free = v; + } + } + } + } + }); + + // create a list of all mappable vertices according to set entries of + // hn + ordinal_t old_mappable = mappable_count; + mappable_count = 0; + Kokkos::parallel_reduce( + "count mappable", policy_t(0, old_mappable), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = mappableVtx(i); + if (hn(u) != ORD_MAX) { + thread_sum++; + } + }, + mappable_count); + + vtx_view_t nextMappable("next mappable vertices", mappable_count); + + Kokkos::parallel_scan( + "get next mappable", policy_t(0, old_mappable), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = mappableVtx(i); + if (hn(u) != ORD_MAX) { + if (final) { + nextMappable(update) = u; + } + + update++; + } + }); + mappableVtx = nextMappable; + + // match vertices with chosen relative + if (mappable_count > 0) { + Kokkos::parallel_for( + policy_t(0, mappable_count), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = mappableVtx(i); + ordinal_t v = hn(u); + int condition = reverse_map(u) < reverse_map(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), + ORD_MAX, v)) { + if (Kokkos::atomic_compare_exchange_strong(&match(v), + ORD_MAX, u)) { + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + match(u) = ORD_MAX; + } + } + } + }); + } + Kokkos::fence(); + swap = swap ^ 1; + } while 
(mappable_count > 0); + } + } + + // create singleton aggregates of remaining unmatched vertices + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + vcmap(i) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + } + }); + + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } +}; + +} // end namespace Experimental +} // end namespace KokkosGraph +// exclude from Cuda builds without lambdas enabled +#endif diff --git a/unit_test/graph/Test_Graph.hpp b/unit_test/graph/Test_Graph.hpp index 9cce1f52b2..c2aeeab9d3 100644 --- a/unit_test/graph/Test_Graph.hpp +++ b/unit_test/graph/Test_Graph.hpp @@ -5,6 +5,7 @@ #include "Test_Graph_graph_color_distance2.hpp" #include "Test_Graph_graph_color.hpp" #include "Test_Graph_mis2.hpp" +#include "Test_Graph_coarsen.hpp" #include "Test_Graph_rcm.hpp" #endif // TEST_GRAPH_HPP diff --git a/unit_test/graph/Test_Graph_coarsen.hpp b/unit_test/graph/Test_Graph_coarsen.hpp new file mode 100644 index 0000000000..af8ec55a23 --- /dev/null +++ b/unit_test/graph/Test_Graph_coarsen.hpp @@ -0,0 +1,481 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Mike Gilbert (msg5334@psu.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include + +#include "KokkosGraph_CoarsenConstruct.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" + +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +using namespace KokkosGraph; +using namespace KokkosGraph::Experimental; + +// namespace Test { + +template +bool verify_coarsening(typename coarsener_t::coarse_level_triple fine_l, + typename coarsener_t::coarse_level_triple coarse_l) { + using crsMat = typename coarsener_t::matrix_t; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename coarsener_t::wgt_view_t; + using ordinal_t = typename entries_t::value_type; + using edge_t = typename rowmap_t::value_type; + + crsMat A = fine_l.mtx; + crsMat coarse_A = coarse_l.mtx; + auto f_rowmap = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + coarse_A.graph.row_map); + typename c_entries_t::HostMirror f_entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + typename c_entries_t::HostMirror vcmap = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); + typename svt::HostMirror few = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + typename svt::HostMirror cew = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); + typename entries_t::HostMirror fvw = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); + typename entries_t::HostMirror cvw = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), coarse_l.vtx_wgts); + ordinal_t f_size = 0; + ordinal_t c_size = 0; + for (ordinal_t i = 0; i < static_cast(fvw.extent(0)); i++) { + f_size += fvw(i); + } + for (ordinal_t i = 0; i < static_cast(cvw.extent(0)); i++) { + c_size += cvw(i); + } + // number of columns in interpolation matrix should give number of rows in + // coarse matrix + if (coarse_l.interp_mtx.numCols() != coarse_l.mtx.numRows()) { + return false; + } + // sum of vertex weights in each graph should be equal + if (f_size != c_size) { + return false; + } + typename svt::value_type f_edges = 0, c_edges = 0; + for (ordinal_t i = 0; i < A.numRows(); i++) { + for (edge_t j = f_rowmap(i); j < f_rowmap(i + 1); j++) { + ordinal_t v = f_entries(j); + if (vcmap(i) != vcmap(v)) { + f_edges += few(j); + } + } + } + for (ordinal_t i = 0; i < coarse_A.numRows(); i++) { + for (edge_t j = c_rowmap(i); j < c_rowmap(i + 1); j++) { + c_edges += cew(j); + } + } + // sum of inter-aggregate edges in fine graph should be sum of all edges in + // coarse graph + if (f_edges != c_edges) { + return false; + } + return true; +} + +template +bool verify_is_graph(crsMat A) { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename 
c_entries_t::non_const_type; + using ordinal_t = typename entries_t::value_type; + using edge_t = typename rowmap_t::value_type; + auto rowmap = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + typename c_entries_t::HostMirror entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + + for (ordinal_t i = 0; i < A.numRows(); i++) { + std::set adjset; + for (edge_t j = rowmap(i); j < rowmap(i + 1); j++) { + ordinal_t v = entries(j); + // A should not contain out-of-bounds columns + if (v >= A.numRows()) { + return false; + } + // Each row should not contain duplicate columns + if (adjset.find(v) != adjset.end()) { + return false; + } + adjset.insert(v); + } + } + return true; +} + +template +bool verify_aggregator(crsMat A, crsMat agg) { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_entries_t = typename graph_type::entries_type; + using entries_t = typename c_entries_t::non_const_type; + using ordinal_t = typename entries_t::value_type; + + // aggregator should have as many rows as A + if (A.numRows() != agg.numRows()) { + return false; + } + // aggregator should have as many entries as A has rows + if (A.numRows() != static_cast(agg.nnz())) { + return false; + } + // column count gives number of vertices in coarse graph + // aggregator should not have more columns than A has rows + // it is valid for coarse graph to be same size as input graph + // some graphs (for example a graph with only self-loops) may produce + // coarsenings that don't shrink the graph + if (A.numRows() < agg.numCols()) { + return false; + } + typename c_entries_t::HostMirror entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + agg.graph.entries); + + std::vector aggregateSizes(agg.numCols(), 0); + for (ordinal_t i = 0; i < static_cast(agg.nnz()); i++) { + ordinal_t v = entries(i); + // aggregator should not have out-of-bounds columns + if (v >= agg.numCols()) { + return false; + } + aggregateSizes[v]++; + } + for (ordinal_t i = 0; i < agg.numCols(); i++) { + // each aggregate label should contain at least one fine vertex + if (aggregateSizes[i] == 0) { + return false; + } + } + return true; +} + +template +crsMat gen_grid() { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; + using lno_t = typename crsMat::ordinal_type; + using scalar = typename crsMat::value_type; + lno_t rows = 200; + lno_t cols = 300; + lno_t total_vtx = rows * cols; + + // create a 2D-mesh + rowmap_t row_map("rowmap", total_vtx + 1); + auto rm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), row_map); + rm(0) = 0; + for (lno_t i = 0; i < rows; i++) { + for (lno_t j = 0; j < cols; j++) { + lno_t vtx_id = i * cols + j; + lno_t edge_count = 0; + if (i > 0) edge_count++; + if (i + 1 < rows) edge_count++; + if (j > 0) edge_count++; + if (j + 1 < cols) edge_count++; + rm(vtx_id + 1) = rm(vtx_id) + edge_count; + } + } + lno_t total_edges = rm(total_vtx); + Kokkos::deep_copy(row_map, rm); + entries_t entries("entries", total_edges); + auto e = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + + for (lno_t i = 0; i < rows; i++) { + for (lno_t j = 0; j < cols; j++) { + lno_t vtx_id = i * cols + j; + lno_t write_offset = rm(vtx_id); + if (i > 0) { + 
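+        // neighbor in the previous grid row sits exactly cols entries earlier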
e(write_offset) = vtx_id - cols; + write_offset++; + } + if (i + 1 < rows) { + e(write_offset) = vtx_id + cols; + write_offset++; + } + if (j > 0) { + e(write_offset) = vtx_id - 1; + write_offset++; + } + if (j + 1 < cols) { + e(write_offset) = vtx_id + 1; + } + } + } + Kokkos::deep_copy(entries, e); + graph_type g(entries, row_map); + svt values("values", total_edges); + Kokkos::deep_copy(values, static_cast(1)); + crsMat A("A", total_vtx, values, g); + return A; +} + +template +void test_multilevel_coarsen_grid() { + using crsMat = + KokkosSparse::CrsMatrix; + crsMat A = gen_grid(); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + handle.h = coarsener_t::HECv1; + handle.b = coarsener_t::Hybrid; + coarsener_t::generate_coarse_graphs(handle, A, true); + std::list levels = handle.results; + auto fine = levels.begin(); + auto coarse = fine; + coarse++; + while (coarse != levels.end()) { + bool correct_aggregator = verify_aggregator(fine->mtx, coarse->interp_mtx); + EXPECT_TRUE(correct_aggregator) + << "Multilevel coarsening produced invalid aggregator on level " + << coarse->level - 1; + bool correct_graph = verify_is_graph(coarse->mtx); + bool correct_coarsening = verify_coarsening(*fine, *coarse); + EXPECT_TRUE(correct_graph) + << "Multilevel coarsening produced invalid graph on level " + << coarse->level; + EXPECT_TRUE(correct_coarsening) + << "Multilevel coarsening produced invalid coarsening on level " + << coarse->level; + fine++; + coarse++; + } +} + +template +void test_coarsen_grid() { + using crsMat = + KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_entries_t = typename graph_type::entries_type; + using entries_t = typename c_entries_t::non_const_type; + crsMat A = gen_grid(); + entries_t vWgts("vertex weights", A.numRows()); + Kokkos::deep_copy(vWgts, static_cast(1)); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + clt fine_A; + fine_A.mtx = A; + fine_A.vtx_wgts = vWgts; + fine_A.level = 0; + fine_A.uniform_weights = true; + std::vector heuristics = { + coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, + coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = { + coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + for (auto h : heuristics) { + handle.h = h; + crsMat aggregator = + coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); + EXPECT_TRUE(correct_aggregator) + << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; + for (auto b : builders) { + handle.b = b; + clt coarse_A = + coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = + verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " + << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; + } + } +} + +template +void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, + lno_t 
row_size_variance) { + using execution_space = typename device::execution_space; + using crsMat = + KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; + // Generate graph + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( + numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + // Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< + c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + numVerts, G.row_map, G.entries, symRowmap, symEntries); + graph_type GS(symEntries, symRowmap); + svt symValues("sym values", symEntries.extent(0)); + entries_t vWgts("vertex weights", numVerts); + Kokkos::deep_copy(symValues, static_cast(2.5)); + Kokkos::deep_copy(vWgts, static_cast(1)); + crsMat AS("A symmetric", numVerts, symValues, GS); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + clt fine_A; + fine_A.mtx = AS; + fine_A.vtx_wgts = vWgts; + fine_A.level = 0; + fine_A.uniform_weights = true; + std::vector heuristics = { + coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, + coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = { + coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + for (auto h : heuristics) { + handle.h = h; + crsMat aggregator = + coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); + EXPECT_TRUE(correct_aggregator) + << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; + for (auto b : builders) { + handle.b = b; + clt coarse_A = + coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = + verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " + << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; + } + } +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_random(5000, 5000 * 20, \ + 1000, 10); \ + test_coarsen_random(50, 50 * 10, 40, 10); \ + test_coarsen_random(5, 5 * 3, 5, 0); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_grid(); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_multilevel_coarsen_grid(); \ + } + +// FIXME_SYCL +#ifndef KOKKOS_ENABLE_SYCL +#if defined(KOKKOSKERNELS_INST_DOUBLE) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
+EXECUTE_TEST(double, int, int, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif +#endif + +#undef EXECUTE_TEST
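+// Illustrative driver sketch, mirroring test_multilevel_coarsen_grid above.
+// It is not exercised by the TEST_F macros; example_coarsen_driver is a
+// hypothetical name, and coarsener_t is assumed to be an instantiation of
+// coarse_builder (as in the tests).
+template <class coarsener_t>
+void example_coarsen_driver(const typename coarsener_t::matrix_t& A) {
+  using clt = typename coarsener_t::coarse_level_triple;
+  typename coarsener_t::coarsen_handle handle;
+  handle.h = coarsener_t::HECv1;   // aggregation heuristic
+  handle.b = coarsener_t::Hybrid;  // coarse graph build / dedupe method
+  // builds the whole hierarchy; levels are read back from the handle because
+  // generate_coarse_graphs cannot return the list directly (NVCC workaround)
+  coarsener_t::generate_coarse_graphs(handle, A, true);
+  std::list<clt> levels = handle.results;
+  // levels.front() holds the input matrix; each later entry is one level
+  // coarser, with its interpolation matrix in interp_mtx
+  (void)levels;
+}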