From 5930a2b92d425ce54870fc69631de6d66069d112 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Mon, 22 Aug 2022 22:57:08 -0400 Subject: [PATCH] Adding my weighted graph coarsening code into kokkos-kernels (#1043) * adding my wgted graph coarsening code into kokkos-kernels * graph coarsening unit tests * updating coarse builder with recent changes * updating with recent changes * fixing some details * more tests; small fixes * fix some problems * fix some things; unit tests work now * change contact * fixed for pthreads and when openmp/serial are not enabled * enable spgemm dedupe type in unit test * clang format * clang format again * converting all member functions to static; using a handle instead of member variable parameters * clang-format * requested changes * exclude new coarsening files from compilation for cuda builds without lambdas enabled * update files after rebase * clang format * fix g++ compiler warnings for serial build * clang format * 'correct' variable was not needed and has been eliminated * fix compiler warning for SKX gcc-7 build * fix randomly failing coarsening test for small random graphs --- .../KokkosKernels_HashmapAccumulator.hpp | 115 +- src/graph/KokkosGraph_CoarsenConstruct.hpp | 1919 +++++++++++++++++ src/graph/KokkosGraph_CoarsenHeuristics.hpp | 1191 ++++++++++ unit_test/graph/Test_Graph.hpp | 1 + unit_test/graph/Test_Graph_coarsen.hpp | 481 +++++ 5 files changed, 3705 insertions(+), 2 deletions(-) create mode 100644 src/graph/KokkosGraph_CoarsenConstruct.hpp create mode 100644 src/graph/KokkosGraph_CoarsenHeuristics.hpp create mode 100644 unit_test/graph/Test_Graph_coarsen.hpp diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index c6397fd9ea..c988e2f4f7 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -46,8 +46,6 @@ #include #include -//#define HASHMAPACCUMULATOR_ASSERT_ENABLED - namespace KokkosKernels { namespace Experimental { @@ -442,6 +440,77 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA because warps do not go in SIMD fashion + // anymore. while some thread might insert my_write_index into linked + // list, another thread in the warp might be reading keys in above loop. + // before inserting the new value in liked list -- which is done with + // atomic exchange below, we make sure that the linked is is complete my + // assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, because no two keys will be inserted into + // hashmap at the same time, as rows have unique columns. + + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. 
If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // just like vector_atomic_insert_into_hash_mergeAdd_TrackHashes + // except uses atomic addition on updating the value + // necessary if duplicate key insertions happen simultaneously + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + const key_type key, const value_type value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + i = hash_begins[hash]; + + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + Kokkos::atomic_add(values + i, value); + return __insert_success; + } + } + } else { + return __insert_success; + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + #if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ defined(KOKKOS_ARCH_AMPERE) // this is an issue on VOLTA and up because warps do not go in SIMD @@ -480,6 +549,48 @@ struct HashmapAccumulator { } } + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes_no_list( + const key_type key, const value_type value, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + // loop until an empty hash is found and the key insertion succeeds + // if our capacity is at least some constant multiple of the current used + // hashes then the expected number of iterations is constant + int depth = 0; + // add key to hash to ensure no two keys follow the same paths over hashes + // add depth to prevent cycles + for (;; hash = __compute_hash(hash + key + depth++, __hashOpRHS)) { + if (keys[hash] == key) { + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } else if (keys[hash] == -1) { + if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, + key)) { + // should only be here if we used a new hash + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, + size_type(1))] = hash; + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } + // we don't care if we failed if some other thread succeeded with the + // same key as ours + if (keys[hash] == key) { + Kokkos::atomic_add(values + hash, value); + return __insert_success; + } + } + } + } else { + return __insert_success; + } + } + // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 template diff --git a/src/graph/KokkosGraph_CoarsenConstruct.hpp b/src/graph/KokkosGraph_CoarsenConstruct.hpp new file mode 100644 index 0000000000..250f7873de --- /dev/null +++ b/src/graph/KokkosGraph_CoarsenConstruct.hpp @@ -0,0 +1,1919 
@@ +#pragma once +// exclude from Cuda builds without lambdas enabled +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#include +#include +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosKernels_HashmapAccumulator.hpp" +#include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" +#include "KokkosGraph_CoarsenHeuristics.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +struct SortLowDegreeCrsMatrixFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + using value_type = lno_t; + + SortLowDegreeCrsMatrixFunctor(bool usingRangePol, const rowmap_t& _rowmap, + const entries_t& _entries, + const values_t& _values, + const lno_t _degreeLimit) + : rowmap(_rowmap), + entries(_entries), + values(_values), + degreeLimit(_degreeLimit) { + if (usingRangePol) { + entriesAux = + entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), + entries.extent(0)); + valuesAux = + values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), + values.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i, + value_type& reducer) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + if (rowNum > degreeLimit) { + reducer++; + return; + } + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort2( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, + valuesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t, + value_type& reducer) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + if (rowNum > degreeLimit) { + Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); + return; + } + KokkosKernels::TeamBitonicSort2( + entries.data() + rowStart, values.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_t entriesAux; + values_t values; + values_t valuesAux; + lno_t degreeLimit; +}; + +} // namespace Impl + +// Sort a CRS matrix: within each row, sort entries ascending by column. +// At the same time, permute the values. +// Only modifies rows below the degreeLimit +template +typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( + const rowmap_t& rowmap, const entries_t& entries, const values_t& values, + const typename entries_t::non_const_value_type degreeLimit) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + Impl::SortLowDegreeCrsMatrixFunctor + funct(useRadix, rowmap, entries, values, degreeLimit); + lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; + lno_t notSorted = 0; + if (useRadix) { + Kokkos::parallel_reduce("sort_crs_matrix", + Kokkos::RangePolicy(0, numRows), + funct, notSorted); + } else { + // Try to get teamsize to be largest power of 2 not greater than avg entries + // per row + // TODO (probably important for performnce): add thread-level sort also, and + // use that for small avg degree. But this works for now. probably important + // for this particular use case of only low-degree rows + int teamSize = 1; + lno_t avgDeg = 0; + if (numRows) avgDeg = (entries.extent(0) + numRows - 1) / numRows; + if (avgDeg > degreeLimit) { + avgDeg = degreeLimit; + } + while (teamSize * 2 * 2 <= avgDeg) { + teamSize *= 2; + } + team_pol temp(numRows, teamSize); + teamSize = std::min(teamSize, + temp.team_size_max(funct, Kokkos::ParallelReduceTag())); + Kokkos::parallel_reduce("sort_crs_matrix", team_pol(numRows, teamSize), + funct, notSorted); + } + return notSorted; +} + +} // namespace KokkosSparse + +namespace KokkosGraph { + +namespace Experimental { + +// this class is not meant to be instantiated +// think of it like a templated namespace +template +class coarse_builder { + public: + // define internal types + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = Kokkos::View; + using wgt_view_t = Kokkos::View; + using edge_view_t = Kokkos::View; + using edge_subview_t = Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = Kokkos::RangePolicy; + using dyn_policy_t = + Kokkos::RangePolicy, exec_space>; + using team_policy_t = Kokkos::TeamPolicy; + using dyn_team_policy_t = + Kokkos::TeamPolicy, exec_space>; + using member = typename team_policy_t::member_type; + using spgemm_kernel_handle = KokkosKernels::Experimental::KokkosKernelsHandle< + edge_offset_t, ordinal_t, scalar_t, exec_space, mem_space, mem_space>; + using uniform_memory_pool_t = + KokkosKernels::Impl::UniformMemoryPool; + using mapper_t = coarsen_heuristics; + static constexpr ordinal_t get_null_val() { + // this value must line up with the null value used by the hashmap + // accumulator + if (std::is_signed::value) { + return -1; + } else { + return std::numeric_limits::max(); + } + } + static constexpr ordinal_t ORD_MAX = get_null_val(); + static constexpr bool is_host_space = std::is_same< + typename exec_space::memory_space, + typename Kokkos::DefaultHostExecutionSpace::memory_space>::value; + static constexpr bool scal_eq_ord = std::is_same::value; + // contains matrix and vertex weights corresponding to current level + // interp matrix maps previous level to this level + struct coarse_level_triple { + matrix_t mtx; + vtx_view_t vtx_wgts; + matrix_t interp_mtx; + int level; + bool uniform_weights; + }; + + // define behavior-controlling enums + enum Heuristic { HECv1, Match, MtMetis, MIS2, GOSHv1, GOSHv2 }; + enum Builder { Sort, Hashmap, Hybrid, Spgemm, Spgemm_transpose_first }; + + struct coarsen_handle { + // internal parameters and data + // default heuristic is HEC + Heuristic h = HECv1; + // default builder is Hybrid + Builder b = Hybrid; + std::list results; + ordinal_t coarse_vtx_cutoff = 50; + ordinal_t min_allowed_vtx = 10; + unsigned int max_levels = 200; + size_t max_mem_allowed = 
536870912; + }; + + // determine if dynamic scheduling should be used + static bool should_use_dyn( + const ordinal_t n, const Kokkos::View work, + int t_count) { + bool use_dyn = false; + edge_offset_t max = 0; + edge_offset_t min = std::numeric_limits::max(); + if (is_host_space) { + ordinal_t static_size = (n + t_count) / t_count; + for (ordinal_t i = 0; i < t_count; i++) { + ordinal_t start = i * static_size; + ordinal_t end = start + static_size; + if (start > n) start = n; + if (end > n) end = n; + edge_offset_t size = work(end) - work(start); + if (size > max) { + max = size; + } + if (size < min) { + min = size; + } + } + if (n > 500000 && max > 5 * min) { + use_dyn = true; + } + } + return use_dyn; + } + + // build the course graph according to ((B^T A) B) or (B^T (A B)), where B is + // aggregator matrix + static coarse_level_triple build_coarse_graph_spgemm( + coarsen_handle& handle, const coarse_level_triple level, + const matrix_t interp_mtx) { + vtx_view_t f_vtx_w = level.vtx_wgts; + matrix_t g = level.mtx; + + ordinal_t n = g.numRows(); + ordinal_t nc = interp_mtx.numCols(); + + matrix_t interp_transpose = + KokkosSparse::Impl::transpose_matrix(interp_mtx); + + spgemm_kernel_handle kh; + kh.set_team_work_size(64); + kh.set_dynamic_scheduling(true); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::SPGEMM_KK_MEMORY; + kh.create_spgemm_handle(spgemm_algorithm); + + vtx_view_t adj_coarse; + wgt_view_t wgt_coarse; + edge_view_t row_map_coarse; + + if (handle.b == Spgemm_transpose_first) { + edge_view_t row_map_p1("rows_partial", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, g.graph.row_map, + g.graph.entries, false, row_map_p1); + + // partial-result matrix + vtx_view_t entries_p1("adjacencies_partial", + kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", + kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, n, interp_transpose.graph.row_map, + interp_transpose.graph.entries, interp_transpose.values, false, + g.graph.row_map, g.graph.entries, g.values, false, row_map_p1, + entries_p1, values_p1); + + row_map_coarse = edge_view_t("rows_coarse", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, nc, row_map_p1, entries_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_coarse); + // coarse-graph adjacency matrix + adj_coarse = + vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = + wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, nc, row_map_p1, entries_p1, values_p1, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, row_map_coarse, adj_coarse, wgt_coarse); + } else { + edge_view_t row_map_p1("rows_partial", n + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, n, n, nc, g.graph.row_map, g.graph.entries, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, false, + row_map_p1); + + // partial-result matrix + vtx_view_t entries_p1("adjacencies_partial", + kh.get_spgemm_handle()->get_c_nnz()); + wgt_view_t values_p1("weights_partial", + kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, n, n, nc, g.graph.row_map, g.graph.entries, g.values, false, + interp_mtx.graph.row_map, interp_mtx.graph.entries, interp_mtx.values, + false, 
row_map_p1, entries_p1, values_p1); + + row_map_coarse = edge_view_t("rows_coarse", nc + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, false, row_map_p1, entries_p1, false, + row_map_coarse); + // coarse-graph adjacency matrix + adj_coarse = + vtx_view_t("adjacencies_coarse", kh.get_spgemm_handle()->get_c_nnz()); + wgt_coarse = + wgt_view_t("weights_coarse", kh.get_spgemm_handle()->get_c_nnz()); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, nc, n, nc, interp_transpose.graph.row_map, + interp_transpose.graph.entries, interp_transpose.values, false, + row_map_p1, entries_p1, values_p1, false, row_map_coarse, adj_coarse, + wgt_coarse); + } + + // now we must remove self-loop edges + edge_view_t nonLoops("nonLoop", nc); + + // gonna reuse this to count non-self loop edges + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t i) { nonLoops(i) = 0; }); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(ordinal_t u) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); + j++) { + if (adj_coarse(j) != u) { + nonLoops(u)++; + } + } + }); + + edge_view_t row_map_nonloop("nonloop row map", nc + 1); + + Kokkos::parallel_scan( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& update, + const bool final) { + const edge_offset_t val_i = nonLoops(i); + update += val_i; + if (final) { + row_map_nonloop(i + 1) = update; + } + }); + + edge_subview_t rmn_subview = Kokkos::subview(row_map_nonloop, nc); + edge_offset_t rmn = 0; + Kokkos::deep_copy(rmn, rmn_subview); + + vtx_view_t entries_nonloop("nonloop entries", rmn); + wgt_view_t values_nonloop("nonloop values", rmn); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t i) { nonLoops(i) = 0; }); + + Kokkos::parallel_for( + policy_t(0, nc), KOKKOS_LAMBDA(const ordinal_t u) { + for (edge_offset_t j = row_map_coarse(u); j < row_map_coarse(u + 1); + j++) { + if (adj_coarse(j) != u) { + edge_offset_t offset = row_map_nonloop(u) + nonLoops(u)++; + entries_nonloop(offset) = adj_coarse(j); + values_nonloop(offset) = wgt_coarse(j); + } + } + }); + // done removing self-loop edges + + kh.destroy_spgemm_handle(); + + graph_type gc_graph(entries_nonloop, row_map_nonloop); + matrix_t gc("gc", nc, values_nonloop, gc_graph); + + vtx_view_t c_vtx_w("coarse vtx weights", interp_mtx.numCols()); + Kokkos::parallel_for( + "compute coarse vtx wgts", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = interp_mtx.graph.entries(i); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + }); + + coarse_level_triple next_level; + next_level.mtx = gc; + next_level.vtx_wgts = c_vtx_w; + next_level.level = level.level + 1; + next_level.interp_mtx = interp_mtx; + next_level.uniform_weights = false; + return next_level; + } + + struct prefix_sum { + vtx_view_t input; + edge_view_t output; + + prefix_sum(vtx_view_t _input, edge_view_t _output) + : input(_input), output(_output) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t i, edge_offset_t& update, + const bool final) const { + const edge_offset_t val_i = input(i); + update += val_i; + if (final) { + output(i + 1) = update; + } + } + }; + + struct functorDedupeLowDegreeAfterSort { + // compiler may get confused what the reduction type is without this + typedef edge_offset_t value_type; + + edge_view_t row_map; + vtx_view_t entries, entriesOut; + wgt_view_t wgts, wgtsOut; + vtx_view_t dedupe_edge_count; + ordinal_t degreeLimit; + + 
functorDedupeLowDegreeAfterSort(edge_view_t _row_map, vtx_view_t _entries, + vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, + vtx_view_t _dedupe_edge_count, + ordinal_t _degreeLimit_) + : row_map(_row_map), + entries(_entries), + entriesOut(_entriesOut), + wgts(_wgts), + wgtsOut(_wgtsOut), + dedupe_edge_count(_dedupe_edge_count), + degreeLimit(_degreeLimit_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, edge_offset_t& thread_sum) const { + ordinal_t u = thread.league_rank(); + edge_offset_t start = row_map(u); + edge_offset_t end = row_map(u + 1); + ordinal_t degree = end - start; + if (degree > degreeLimit) { + return; + } + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, start + dedupe_edge_count(u)), + [&](const edge_offset_t& i) { + entries(i) = entriesOut(i); + wgts(i) = wgtsOut(i); + }); + Kokkos::single(Kokkos::PerTeam(thread), + [&]() { thread_sum += dedupe_edge_count(u); }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& u, edge_offset_t& thread_sum) const { + ordinal_t offset = row_map(u); + ordinal_t last = ORD_MAX; + ordinal_t degree = row_map(u + 1) - row_map(u); + if (degree > degreeLimit) { + return; + } + for (edge_offset_t i = row_map(u); i < row_map(u + 1); i++) { + if (last != entries(i)) { + entriesOut(offset) = entries(i); + wgtsOut(offset) = wgts(i); + last = entries(offset); + offset++; + } else { + wgtsOut(offset - 1) += wgts(i); + } + } + dedupe_edge_count(u) = offset - row_map(u); + thread_sum += offset - row_map(u); + } + }; + + struct functorDedupeAfterSort { + // compiler may get confused what the reduction type is without this + typedef edge_offset_t value_type; + + edge_view_t row_map; + vtx_view_t entries, entriesOut; + wgt_view_t wgts, wgtsOut; + vtx_view_t dedupe_edge_count; + + functorDedupeAfterSort(edge_view_t _row_map, vtx_view_t _entries, + vtx_view_t _entriesOut, wgt_view_t _wgts, + wgt_view_t _wgtsOut, vtx_view_t _dedupe_edge_count) + : row_map(_row_map), + entries(_entries), + entriesOut(_entriesOut), + wgts(_wgts), + wgtsOut(_wgtsOut), + dedupe_edge_count(_dedupe_edge_count) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, edge_offset_t& thread_sum) const { + ordinal_t u = thread.league_rank(); + edge_offset_t start = row_map(u); + edge_offset_t end = row_map(u + 1); + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(thread, start, end), + [&](const edge_offset_t& i, edge_offset_t& update, const bool final) { + if (i == start) { + update += 1; + } else if (entries(i) != entries(i - 1)) { + update += 1; + } + if (final) { + entriesOut(start + update - 1) = entries(i); + // requires that wgtsOut be initialized to 0 + Kokkos::atomic_add(&wgtsOut(start + update - 1), wgts(i)); + if (i + 1 == end) { + dedupe_edge_count(u) = update; + } + } + }); + Kokkos::single(Kokkos::PerTeam(thread), + [&]() { thread_sum += dedupe_edge_count(u); }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& u, edge_offset_t& thread_sum) const { + ordinal_t offset = row_map(u); + ordinal_t 
last = ORD_MAX; + for (edge_offset_t i = row_map(u); i < row_map(u + 1); i++) { + if (last != entries(i)) { + entries(offset) = entries(i); + wgtsOut(offset) = wgts(i); + last = entries(offset); + offset++; + } else { + wgtsOut(offset - 1) += wgts(i); + } + } + dedupe_edge_count(u) = offset - row_map(u); + thread_sum += offset - row_map(u); + } + }; + + struct functorCollapseDirectedToUndirected { + const edge_view_t source_row_map; + const edge_view_t target_row_map; + const vtx_view_t source_edge_counts; + vtx_view_t target_edge_counts; + const vtx_view_t source_destinations; + vtx_view_t target_destinations; + const wgt_view_t source_wgts; + wgt_view_t target_wgts; + + functorCollapseDirectedToUndirected( + const edge_view_t _source_row_map, const edge_view_t _target_row_map, + const vtx_view_t _source_edge_counts, vtx_view_t _target_edge_counts, + const vtx_view_t _source_destinations, vtx_view_t _target_destinations, + const wgt_view_t _source_wgts, wgt_view_t _target_wgts) + : source_row_map(_source_row_map), + target_row_map(_target_row_map), + source_edge_counts(_source_edge_counts), + target_edge_counts(_target_edge_counts), + source_destinations(_source_destinations), + target_destinations(_target_destinations), + source_wgts(_source_wgts), + target_wgts(_target_wgts) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread) const { + ordinal_t u = thread.league_rank(); + edge_offset_t u_origin = source_row_map(u); + edge_offset_t u_dest_offset = target_row_map(u); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, source_edge_counts(u)), + [=](const edge_offset_t u_idx) { + ordinal_t v = source_destinations(u_origin + u_idx); + scalar_t wgt = source_wgts(u_origin + u_idx); + edge_offset_t v_dest_offset = target_row_map(v); + edge_offset_t v_dest = + v_dest_offset + + Kokkos::atomic_fetch_add(&target_edge_counts(v), 1); + edge_offset_t u_dest = + u_dest_offset + + Kokkos::atomic_fetch_add(&target_edge_counts(u), 1); + + target_destinations(u_dest) = v; + target_wgts(u_dest) = wgt; + target_destinations(v_dest) = u; + target_wgts(v_dest) = wgt; + }); + } + }; + + struct functorHashmapAccumulator { + // compiler may get confused what the reduction type is without this + typedef ordinal_t value_type; + edge_view_t row_map; + vtx_view_t entries_in, entries_out; + wgt_view_t wgts_in, wgts_out; + vtx_view_t dedupe_edge_count; + uniform_memory_pool_t memory_pool; + const ordinal_t hash_size; + const ordinal_t max_hash_entries; + vtx_view_t remaining; + bool use_out; + + functorHashmapAccumulator(edge_view_t _row_map, vtx_view_t _entries_in, + vtx_view_t _entries_out, wgt_view_t _wgts_in, + wgt_view_t _wgts_out, + vtx_view_t _dedupe_edge_count, + uniform_memory_pool_t _memory_pool, + const ordinal_t _hash_size, + const ordinal_t _max_hash_entries, + vtx_view_t _remaining, bool _use_out) + : row_map(_row_map), + entries_in(_entries_in), + entries_out(_entries_out), + wgts_in(_wgts_in), + wgts_out(_wgts_out), + dedupe_edge_count(_dedupe_edge_count), + memory_pool(_memory_pool), + hash_size(_hash_size), + max_hash_entries(_max_hash_entries), + remaining(_remaining), + use_out(_use_out) {} + + KOKKOS_INLINE_FUNCTION + ordinal_t get_thread_id(const ordinal_t row_index) const { +#if defined(KOKKOS_ENABLE_SERIAL) + if (std::is_same::value) return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same::value) + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + if (std::is_same::value) + return 
Kokkos::Threads::impl_hardware_thread_id(); +#endif + return row_index; + } + + // reduces to find total number of rows that were too large + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& rem_idx, ordinal_t& thread_sum) const { + ordinal_t idx = remaining(rem_idx); + typedef ordinal_t hash_size_type; + typedef ordinal_t hash_key_type; + typedef scalar_t hash_value_type; + + // can't do this row at current hashmap size + ordinal_t hash_entries = row_map(idx + 1) - row_map(idx); + if (hash_entries >= max_hash_entries) { + thread_sum++; + return; + } + volatile ordinal_t* ptr_temp = nullptr; + ordinal_t t_id = rem_idx; + // need to use the hardware thread id if the pool type is + // OneThread2OneChunk + t_id = get_thread_id(t_id); + while (nullptr == ptr_temp) { + ptr_temp = (volatile ordinal_t*)(memory_pool.allocate_chunk(t_id)); + } + if (ptr_temp == nullptr) { + return; + } + ordinal_t* ptr_memory_pool_chunk = (ordinal_t*)(ptr_temp); + + // These are updated by Hashmap_Accumulator insert functions. + ordinal_t* used_hash_size = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* used_hash_count = (ordinal_t*)(ptr_temp); + ptr_temp++; + *used_hash_size = 0; + *used_hash_count = 0; + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + ordinal_t hash_func_pow2 = hash_size - 1; + + // Set pointer to hash indices + ordinal_t* used_hash_indices = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash begins + ordinal_t* hash_begins = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash nexts + ordinal_t* hash_nexts = (ordinal_t*)(ptr_temp); + + // Set pointer to hash keys + ordinal_t* keys = (ordinal_t*)entries_out.data() + row_map(idx); + + // Set pointer to hash values + scalar_t* values = (scalar_t*)wgts_out.data() + row_map(idx); + + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, + values); + + for (edge_offset_t i = row_map(idx); i < row_map(idx + 1); i++) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + hash_map.sequential_insert_into_hash_mergeAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, used_hash_indices); + }; + + // Reset the Begins values to -1 before releasing the memory pool chunk. + // If you don't do this the next thread that grabs this memory chunk will + // not work properly. 
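+      // Note: only the buckets recorded in used_hash_indices are cleared, so
+      // this cleanup costs O(used_hash_count) rather than O(hash_size).
+      // ORD_MAX is used as the empty marker so that it matches the null value
+      // expected by the HashmapAccumulator (see get_null_val above).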
+ for (ordinal_t i = 0; i < *used_hash_count; i++) { + ordinal_t dirty_hash = used_hash_indices[i]; + + hash_map.hash_begins[dirty_hash] = ORD_MAX; + }; + + // used_hash_size gives the number of entries, used_hash_count gives the + // number of dirty hash values (I think) + dedupe_edge_count(idx) = *used_hash_size; + // Release the memory pool chunk back to the pool + memory_pool.release_chunk(ptr_memory_pool_chunk); + + } // operator() + + // reduces to find total number of rows that were too large + KOKKOS_INLINE_FUNCTION + void operator()(const member& thread, ordinal_t& thread_sum) const { + ordinal_t idx = remaining(thread.league_rank()); + typedef ordinal_t hash_size_type; + typedef ordinal_t hash_key_type; + typedef scalar_t hash_value_type; + + // can't do this row at current hashmap size + ordinal_t hash_entries = row_map(idx + 1) - row_map(idx); + if (hash_entries >= max_hash_entries) { + Kokkos::single(Kokkos::PerTeam(thread), [&]() { thread_sum++; }); + thread.team_barrier(); + return; + } + volatile ordinal_t* ptr_temp = nullptr; + Kokkos::single( + Kokkos::PerTeam(thread), + [=](volatile ordinal_t*& ptr_write) { + // Acquire a chunk from the memory pool using a spin-loop. + ptr_write = nullptr; + while (nullptr == ptr_write) { + ptr_write = (volatile ordinal_t*)(memory_pool.allocate_chunk( + thread.league_rank())); + } + }, + ptr_temp); + thread.team_barrier(); + if (ptr_temp == nullptr) { + return; + } + ordinal_t* ptr_memory_pool_chunk = (ordinal_t*)(ptr_temp); + + // These are updated by Hashmap_Accumulator insert functions. + ordinal_t* used_hash_size = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* used_hash_count = (ordinal_t*)(ptr_temp); + ptr_temp++; + ordinal_t* write_idx = (ordinal_t*)(ptr_temp); + ptr_temp++; + Kokkos::single(Kokkos::PerTeam(thread), [&]() { + *used_hash_size = 0; + *used_hash_count = 0; + *write_idx = 0; + }); + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + ordinal_t hash_func_pow2 = hash_size - 1; + + // Set pointer to hash indices + ordinal_t* used_hash_indices = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash begins + ordinal_t* hash_begins = (ordinal_t*)(ptr_temp); + ptr_temp += hash_size; + + // Set pointer to hash nexts + ordinal_t* hash_nexts = (ordinal_t*)(ptr_temp); + ptr_temp += max_hash_entries; + + // Set pointer to hash keys + ordinal_t* keys = (ordinal_t*)(ptr_temp); + ptr_temp += max_hash_entries; + + // Set pointer to hash values + scalar_t* values; + if (use_out) { + values = (scalar_t*)wgts_out.data() + row_map(idx); + } else { + values = (scalar_t*)(ptr_temp); + } + + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hash_map(hash_size, hash_func_pow2, hash_begins, hash_nexts, keys, + values); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, row_map(idx), row_map(idx + 1)), + [&](const edge_offset_t& i) { + ordinal_t key = entries_in(i); + scalar_t value = wgts_in(i); + // duplicate keys may be inserted simultaneously, this causes + // problems we must handle later + int r = + hash_map + .vector_atomic_insert_into_hash_mergeAtomicAdd_TrackHashes( + key, value, used_hash_size, used_hash_count, + used_hash_indices); + + // Check return code + if (r) { + } + }); + thread.team_barrier(); + + // Reset the Begins values to -1 before releasing the memory pool chunk. 
+ // If you don't do this the next thread that grabs this memory chunk will + // not work properly. Also merges values inside each linked list, because + // there can be duplicate key insertions (these are hopefully rare or else + // performance will suffer) This did not work as a TeamThreadRange, don't + // know why (possibly issues with atomic addition on write_idx) + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *used_hash_count), + [&](const ordinal_t& i) { + ordinal_t dirty_hash = used_hash_indices[i]; + + ordinal_t bucket = hash_begins[dirty_hash]; + + // ascending-key bubble-sort the linked list + // it really do be like that sometimes + ordinal_t end_inner = ORD_MAX; + while (end_inner != bucket) { + ordinal_t last_idx = bucket; + ordinal_t last_key = keys[last_idx]; + scalar_t last_val = values[last_idx]; + bool is_sorted = true; + // bubble-up + for (ordinal_t k = hash_nexts[bucket]; k != end_inner; + k = hash_nexts[k]) { + // swap + if (keys[k] < last_key) { + keys[last_idx] = keys[k]; + values[last_idx] = values[k]; + keys[k] = last_key; + values[k] = last_val; + is_sorted = false; + } + // increment last + last_key = keys[k]; + last_val = values[k]; + last_idx = k; + } + end_inner = last_idx; + if (is_sorted) { + // end the outer loop + end_inner = bucket; + } + } + ordinal_t key = keys[bucket]; + scalar_t val = values[bucket]; + ordinal_t last = bucket; + // merge linked list and write out + for (ordinal_t j = hash_nexts[bucket]; j != ORD_MAX; + j = hash_nexts[j]) { + if (keys[j] == key) { + val += values[j]; + } else { + ordinal_t write_at = + row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + key = keys[j]; + val = values[j]; + } + hash_nexts[last] = ORD_MAX; + last = j; + } + hash_nexts[last] = ORD_MAX; + // write out the final entry in linked list + ordinal_t write_at = + row_map(idx) + Kokkos::atomic_fetch_add(write_idx, 1); + entries_out(write_at) = key; + if (use_out) { + // reuse wgts_in as scratch space because we are overwriting + // working memory if we use wgts_out + wgts_in(write_at) = val; + } else { + wgts_out(write_at) = val; + } + hash_begins[dirty_hash] = ORD_MAX; + }); + thread.team_barrier(); + // need to copy from wgts_in to wgts_out if we used wgts_in as scratch + // space + if (use_out) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(thread, (ordinal_t)0, *write_idx), + [&](const ordinal_t& i) { + wgts_out(row_map(idx) + i) = wgts_in(row_map(idx) + i); + }); + } + + Kokkos::single(Kokkos::PerTeam(thread), [&]() { + // used_hash_size gives the number of entries, used_hash_count gives the + // number of dirty hash values (I think) + dedupe_edge_count(idx) = *write_idx; + // Release the memory pool chunk back to the pool + memory_pool.release_chunk(ptr_memory_pool_chunk); + }); + + } // operator() + + }; // functorHashmapAccumulator + + static void getHashmapSizeAndCount( + coarsen_handle& handle, const ordinal_t n, + const ordinal_t remaining_count, vtx_view_t remaining, + vtx_view_t edges_per_source, ordinal_t& hash_size, ordinal_t& max_entries, + ordinal_t& mem_chunk_size, ordinal_t& mem_chunk_count) { + ordinal_t avg_entries = 0; + if (!is_host_space && + static_cast(remaining_count) / static_cast(n) > 0.01) { + Kokkos::parallel_reduce( + "calc average among remaining", policy_t(0, remaining_count), 
+ KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = remaining(i); + ordinal_t degree = edges_per_source(u); + thread_sum += degree; + }, + avg_entries); + // degrees are often skewed so we want to err on the side of bigger + // hashmaps + avg_entries = avg_entries * 2 / remaining_count; + avg_entries++; + if (avg_entries < 50) avg_entries = 50; + } else { + Kokkos::parallel_reduce( + "calc max", policy_t(0, remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_max) { + ordinal_t u = remaining(i); + ordinal_t degree = edges_per_source(u); + if (degree > thread_max) { + thread_max = degree; + } + }, + Kokkos::Max(avg_entries)); + // need precisely one larger than max, don't remember why atm + avg_entries++; + } + + // Set the hash_size as the next power of 2 bigger than hash_size_hint. + // - hash_size must be a power of two since we use & rather than % (which is + // slower) for computing the hash value for HashmapAccumulator. + max_entries = avg_entries; + hash_size = 1; + while (hash_size < max_entries) { + hash_size *= 2; + } + + // Determine memory chunk size for UniformMemoryPool + mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash begins + mem_chunk_size += + 3 * max_entries; // for hash nexts, keys, and values (unless scalar_t + // != ordinal_t, in which case memory is unused) + mem_chunk_size += 10; // for metadata + mem_chunk_count = exec_space::concurrency(); + if (mem_chunk_count > remaining_count) { + mem_chunk_count = remaining_count + 1; + } + + if (!is_host_space) { + // decrease number of mem_chunks to reduce memory usage if necessary + size_t mem_needed = static_cast(mem_chunk_count) * + static_cast(mem_chunk_size) * + sizeof(ordinal_t); + //~500MB + size_t max_mem_allowed = handle.max_mem_allowed; + if (mem_needed > max_mem_allowed) { + size_t chunk_dif = mem_needed - max_mem_allowed; + chunk_dif = chunk_dif / + (static_cast(mem_chunk_size) * sizeof(ordinal_t)); + chunk_dif++; + mem_chunk_count -= chunk_dif; + } + } + } + + static void deduplicate_graph(coarsen_handle& handle, const ordinal_t n, + const bool use_team, + vtx_view_t edges_per_source, + vtx_view_t dest_by_source, + wgt_view_t wgt_by_source, + const edge_view_t source_bucket_offset, + edge_offset_t& gc_nedges) { + if (handle.b == Hashmap || is_host_space) { + ordinal_t remaining_count = n; + vtx_view_t remaining("remaining vtx", n); + Kokkos::parallel_for( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i) { remaining(i) = i; }); + // deduplicate rows in phases starting with the small degree rows so we + // can use small hashmaps increase the hashmap size each phase to the + // necessary size for twice the average of remaining rows + wgt_view_t wgt_out = wgt_by_source; + if (!scal_eq_ord && !is_host_space) { + // only necessary if teams might be used + wgt_out = wgt_view_t("wgts out", wgt_by_source.extent(0)); + } + do { + // determine size for hashmap + ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; + getHashmapSizeAndCount(handle, n, remaining_count, remaining, + edges_per_source, hash_size, max_entries, + mem_chunk_size, mem_chunk_count); + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::ManyThread2OneChunk; + + if (is_host_space) { + pool_type = KokkosKernels::Impl::OneThread2OneChunk; + } + + bool use_dyn = should_use_dyn(n, source_bucket_offset, mem_chunk_count); + + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, + ORD_MAX, 
pool_type); + + functorHashmapAccumulator hashmapAccumulator( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, memory_pool, hash_size, max_entries, + remaining, !scal_eq_ord); + + ordinal_t old_remaining_count = remaining_count; + if (!is_host_space && max_entries >= 128) { + Kokkos::parallel_reduce("hashmap time", + team_policy_t(old_remaining_count, 1, 64), + hashmapAccumulator, remaining_count); + } else { + if (use_dyn) { + Kokkos::parallel_reduce( + "hashmap time", + dyn_policy_t(0, old_remaining_count, Kokkos::ChunkSize(128)), + hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", + policy_t(0, old_remaining_count), + hashmapAccumulator, remaining_count); + } + } + + if (remaining_count > 0) { + vtx_view_t new_remaining("new remaining vtx", remaining_count); + + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, old_remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = remaining(i); + if (edges_per_source(u) >= max_entries) { + if (final) { + new_remaining(update) = u; + } + update++; + } + }); + + remaining = new_remaining; + } + } while (remaining_count > 0); + Kokkos::parallel_reduce( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += edges_per_source(i); + }, + gc_nedges); + if (!scal_eq_ord && !is_host_space) { + Kokkos::deep_copy(wgt_by_source, wgt_out); + } + } else if (handle.b == Sort) { + // sort the (implicit) crs matrix + KokkosSparse::sort_crs_matrix(source_bucket_offset, + dest_by_source, wgt_by_source); + + // combine adjacent entries that are equal + if (use_team) { + // thread team version + wgt_view_t wgts_out("wgts after dedupe", wgt_by_source.extent(0)); + vtx_view_t dest_out("dest after dedupe", dest_by_source.extent(0)); + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, + dest_out, wgt_by_source, wgts_out, + edges_per_source); + Kokkos::parallel_reduce("deduplicated sorted", team_policy_t(n, 64), + deduper, gc_nedges); + Kokkos::deep_copy(wgt_by_source, wgts_out); + Kokkos::deep_copy(dest_by_source, dest_out); + } else { + // no thread team version + functorDedupeAfterSort deduper(source_bucket_offset, dest_by_source, + dest_by_source, wgt_by_source, + wgt_by_source, edges_per_source); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, + gc_nedges); + } + + } else if (handle.b == Hybrid) { + wgt_view_t wgt_out = wgt_by_source; + if (!scal_eq_ord && !is_host_space) { + // only necessary if teams might be used + wgt_out = wgt_view_t("wgts out", wgt_by_source.extent(0)); + } + ordinal_t limit = 128; + // sort the (implicit) crs matrix, but only the low degree rows + ordinal_t remaining_count = + KokkosSparse::sort_low_degree_rows_crs_matrix( + source_bucket_offset, dest_by_source, wgt_by_source, limit); + // combine adjacent entries that are equal + { + // no thread team version + functorDedupeLowDegreeAfterSort deduper( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, limit); + Kokkos::parallel_reduce("deduplicated sorted", policy_t(0, n), deduper, + gc_nedges); + } + vtx_view_t remaining("remaining vtx", remaining_count); + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + if (edges_per_source(i) > limit) { + if (final) { + remaining(update) = i; + } + update++; + } + }); + // deduplicate 
rows in phases starting with the small degree rows so we + // can use small hashmaps increase the hashmap size each phase to the + // necessary size for twice the average of remaining rows + while (remaining_count > 0) { + // determine size for hashmap + ordinal_t hash_size, max_entries, mem_chunk_size, mem_chunk_count; + getHashmapSizeAndCount(handle, n, remaining_count, remaining, + edges_per_source, hash_size, max_entries, + mem_chunk_size, mem_chunk_count); + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::ManyThread2OneChunk; + + if (is_host_space) { + pool_type = KokkosKernels::Impl::OneThread2OneChunk; + } + + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, + ORD_MAX, pool_type); + + functorHashmapAccumulator hashmapAccumulator( + source_bucket_offset, dest_by_source, dest_by_source, wgt_by_source, + wgt_out, edges_per_source, memory_pool, hash_size, max_entries, + remaining, !scal_eq_ord); + + ordinal_t old_remaining_count = remaining_count; + if (!is_host_space && max_entries >= 128) { + Kokkos::parallel_reduce("hashmap time", + dyn_team_policy_t(old_remaining_count, 1, 64), + hashmapAccumulator, remaining_count); + } else { + Kokkos::parallel_reduce("hashmap time", + dyn_policy_t(0, old_remaining_count), + hashmapAccumulator, remaining_count); + } + + if (remaining_count > 0) { + vtx_view_t new_remaining("new remaining vtx", remaining_count); + + Kokkos::parallel_scan( + "move remaining vertices", policy_t(0, old_remaining_count), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = remaining(i); + if (edges_per_source(u) >= max_entries) { + if (final) { + new_remaining(update) = u; + } + update++; + } + }); + + remaining = new_remaining; + } + } + gc_nedges = 0; + Kokkos::parallel_reduce( + policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += edges_per_source(i); + }, + gc_nedges); + if (!scal_eq_ord && !is_host_space) { + Kokkos::deep_copy(wgt_by_source, wgt_out); + } + } + } + + struct translationFunctor { + matrix_t vcmap, g; + vtx_view_t mapped_edges, edges_per_source; + edge_view_t source_bucket_offset; + vtx_view_t edges_out; + wgt_view_t wgts_out; + ordinal_t workLength; + + translationFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, + vtx_view_t _edges_per_source, + edge_view_t _source_bucket_offset, vtx_view_t _edges_out, + wgt_view_t _wgts_out) + : vcmap(_vcmap), + g(_g), + mapped_edges(_mapped_edges), + edges_per_source(_edges_per_source), + source_bucket_offset(_source_bucket_offset), + edges_out(_edges_out), + wgts_out(_wgts_out), + workLength(_g.numRows()) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& t) const { + ordinal_t i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= workLength) return; + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this, inefficient + edge_offset_t offset = Kokkos::atomic_fetch_add( + &edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& i) const { + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = 
g.graph.row_map(i + 1); + for (edge_offset_t idx = start; idx < end; idx++) { + ordinal_t v = mapped_edges(idx); + if (u != v) { + // fix this + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + edges_out(offset) = v; + wgts_out(offset) = g.values(idx); + } + } + } + }; + + // optimized for regular distribution low degree rows + static coarse_level_triple build_nonskew(coarsen_handle& handle, + const matrix_t g, + const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t edges_per_source) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + edge_offset_t gc_nedges = 0; + + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + + Kokkos::deep_copy(edges_per_source, static_cast(0)); + + edge_subview_t sbo_subview = Kokkos::subview(source_bucket_offset, nc); + edge_offset_t nnz_pre_dedupe = 0; + Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); + + vtx_view_t dest_by_source("dest_by_source", nnz_pre_dedupe); + wgt_view_t wgt_by_source("wgt_by_source", nnz_pre_dedupe); + + // translates fine entries into coarse entries and writes into coarse rows + translationFunctor translateF(vcmap, g, mapped_edges, edges_per_source, + source_bucket_offset, dest_by_source, + wgt_by_source); + if (is_host_space) { + bool use_dyn = + should_use_dyn(n, g.graph.row_map, exec_space::concurrency()); + if (use_dyn) { + Kokkos::parallel_for("move edges to coarse matrix", dyn_policy_t(0, n), + translateF); + } else { + Kokkos::parallel_for("move edges to coarse matrix", policy_t(0, n), + translateF); + } + } else { + auto execSpaceEnum = + KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( + n, g.nnz(), execSpaceEnum); + team_policy_t dummy(1, 1, vectorLength); + int teamSize = dummy.team_size_max(translateF, Kokkos::ParallelForTag()); + Kokkos::parallel_for( + "move edges to coarse matrix", + team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), + translateF); + } + + deduplicate_graph(handle, nc, false, edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + edge_view_t source_offsets("source_offsets", nc + 1); + + Kokkos::parallel_scan("calc source offsets again", policy_t(0, nc), + prefix_sum(edges_per_source, source_offsets)); + + edge_subview_t edge_total_subview = Kokkos::subview(source_offsets, nc); + Kokkos::deep_copy(gc_nedges, edge_total_subview); + + vtx_view_t dest_idx("dest_idx", gc_nedges); + wgt_view_t wgts("wgts", gc_nedges); + + if (is_host_space) { + bool use_dyn = + should_use_dyn(nc, source_offsets, exec_space::concurrency()); + if (use_dyn) { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", dyn_policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t& u) { + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { + dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + } + }); + } else { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t& u) { + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + for (ordinal_t idx = 0; idx < edges_per_source(u); idx++) { + 
dest_idx(start_dest + idx) = dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + } + }); + } + } else { + Kokkos::parallel_for( + "move deduped edges to new coarse matrix", + team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + edge_offset_t start_origin = source_bucket_offset(u); + edge_offset_t start_dest = source_offsets(u); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, edges_per_source(u)), + [=](const ordinal_t idx) { + dest_idx(start_dest + idx) = + dest_by_source(start_origin + idx); + wgts(start_dest + idx) = wgt_by_source(start_origin + idx); + }); + }); + } + + graph_type gc_graph(dest_idx, source_offsets); + matrix_t gc("gc", nc, wgts, gc_graph); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + // forms the explicit matrix created by symmetrizing the implicit matrix + static matrix_t collapse_directed_to_undirected( + const ordinal_t nc, const vtx_view_t source_edge_counts, + const edge_view_t source_row_map, const vtx_view_t source_destinations, + const wgt_view_t source_wgts) { + vtx_view_t coarse_degree("coarse degree", nc); + Kokkos::deep_copy(coarse_degree, source_edge_counts); + + Kokkos::parallel_for( + "count directed edges owned by opposite endpoint", + team_policy_t(nc, Kokkos::AUTO), KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + edge_offset_t start = source_row_map(u); + edge_offset_t end = start + source_edge_counts(u); + Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = source_destinations(idx); + // increment other vertex + Kokkos::atomic_fetch_add(&coarse_degree(v), 1); + }); + }); + + edge_view_t target_row_map("target row map", nc + 1); + + Kokkos::parallel_scan("calc target row map", policy_t(0, nc), + prefix_sum(coarse_degree, target_row_map)); + + Kokkos::deep_copy(coarse_degree, static_cast(0)); + + edge_offset_t coarse_edges_total = 0; + edge_subview_t coarse_edge_total_subview = + Kokkos::subview(target_row_map, nc); + Kokkos::deep_copy(coarse_edges_total, coarse_edge_total_subview); + + vtx_view_t dest_idx("dest_idx", coarse_edges_total); + wgt_view_t wgts("wgts", coarse_edges_total); + + Kokkos::parallel_for( + "move edges into correct size matrix", team_policy_t(nc, Kokkos::AUTO), + functorCollapseDirectedToUndirected( + source_row_map, target_row_map, source_edge_counts, coarse_degree, + source_destinations, dest_idx, source_wgts, wgts)); + + graph_type gc_graph(dest_idx, target_row_map); + matrix_t gc("gc", nc, wgts, gc_graph); + return gc; + } + + // optimized for skewed degree distributions + static coarse_level_triple build_skew(coarsen_handle& handle, + const matrix_t g, const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t degree_initial) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_offset_t gc_nedges = 0; + + vtx_view_t edges_per_source("edges_per_source", nc); + + // recount with edges only belonging to coarse vertex of smaller degree + // matrix becomes directed + Kokkos::parallel_for( + "recount edges", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + 
Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + Kokkos::atomic_add(&edges_per_source(u), nonLoopEdgesTotal); + }); + }); + + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + edge_subview_t sbo_subview = Kokkos::subview(source_bucket_offset, nc); + edge_offset_t nnz_pre_dedupe = 0; + Kokkos::deep_copy(nnz_pre_dedupe, sbo_subview); + + Kokkos::deep_copy(edges_per_source, static_cast(0)); + vtx_view_t dest_by_source("dest by source", nnz_pre_dedupe); + wgt_view_t wgt_by_source("wgt by source", nnz_pre_dedupe); + Kokkos::parallel_for( + "combine fine rows", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + dest_by_source(offset) = v; + wgt_by_source(offset) = g.values(idx); + } + }); + }); + gc_nedges = 0; + + deduplicate_graph(handle, nc, true, edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + // form the final coarse graph, which requires symmetrizing the matrix + matrix_t gc = collapse_directed_to_undirected( + nc, edges_per_source, source_bucket_offset, dest_by_source, + wgt_by_source); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + // optimized for very large row sizes caused by lots of duplicate entries + // first translates each fine entry to its coarse vertex label + // deduplicates within each fine row + // combines fine rows into coarse rows + // deduplicates within each coarse row + static coarse_level_triple build_high_duplicity(coarsen_handle& handle, + const matrix_t g, + const matrix_t vcmap, + vtx_view_t mapped_edges, + vtx_view_t degree_initial) { + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + edge_offset_t gc_nedges = 0; + + vtx_view_t dedupe_count("dedupe count", n); + edge_view_t row_map_copy("row map copy", n + 1); + + // recount fine row sizes with edges only belonging to fine vertex of coarse + // vertex of smaller degree matrix becomes directed + Kokkos::parallel_for( + "recount edges", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = 
mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + dedupe_count(outer_idx) = nonLoopEdgesTotal; + }); + }); + + Kokkos::parallel_scan("calc source offsets", policy_t(0, n), + prefix_sum(dedupe_count, row_map_copy)); + // reset counters to 0 + Kokkos::deep_copy(dedupe_count, static_cast(0)); + + edge_subview_t fine_recount_subview = Kokkos::subview(row_map_copy, n); + edge_offset_t fine_recount = 0; + Kokkos::deep_copy(fine_recount, fine_recount_subview); + + vtx_view_t dest_fine("fine to coarse dests", fine_recount); + wgt_view_t wgt_fine("fine to coarse wgts", fine_recount); + + // create a new directed version of the fine matrix + Kokkos::parallel_for( + "move edges to new matrix", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = g.graph.row_map(outer_idx); + edge_offset_t end = g.graph.row_map(outer_idx + 1); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = mapped_edges(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (u != v && (degree_less || (degree_equal && u < v))) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&dedupe_count(outer_idx), 1); + + offset += row_map_copy(outer_idx); + + dest_fine(offset) = v; + wgt_fine(offset) = g.values(idx); + } + }); + }); + //"delete" these views + Kokkos::resize(mapped_edges, 0); + + // deduplicate coarse adjacencies within each fine row + deduplicate_graph(handle, n, true, dedupe_count, dest_fine, wgt_fine, + row_map_copy, gc_nedges); + + edge_view_t source_bucket_offset("source_bucket_offsets", nc + 1); + vtx_view_t edges_per_source("edges_per_source", nc); + + Kokkos::parallel_for( + "sum fine row sizes", policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = vcmap.graph.entries(i); + Kokkos::atomic_fetch_add(&edges_per_source(u), dedupe_count(i)); + }); + Kokkos::parallel_scan("calc source offsets", policy_t(0, nc), + prefix_sum(edges_per_source, source_bucket_offset)); + Kokkos::deep_copy(edges_per_source, static_cast(0)); + vtx_view_t dest_by_source("dest by source", gc_nedges); + wgt_view_t wgt_by_source("wgt by source", gc_nedges); + Kokkos::parallel_for( + "combine deduped fine rows", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t outer_idx = thread.league_rank(); + ordinal_t u = vcmap.graph.entries(outer_idx); + edge_offset_t start = row_map_copy(outer_idx); + edge_offset_t end = start + dedupe_count(outer_idx); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, start, end), + [=](const edge_offset_t idx) { + ordinal_t v = dest_fine(idx); + bool degree_less = degree_initial(u) < degree_initial(v); + bool degree_equal = degree_initial(u) == degree_initial(v); + if (degree_less || (degree_equal && u < v)) { + edge_offset_t offset = + Kokkos::atomic_fetch_add(&edges_per_source(u), 1); + + offset += source_bucket_offset(u); + + dest_by_source(offset) = v; + wgt_by_source(offset) = wgt_fine(idx); + } + }); + }); + gc_nedges = 0; + Kokkos::resize(dest_fine, 0); + Kokkos::resize(wgt_fine, 0); + + deduplicate_graph(handle, nc, true, 
edges_per_source, dest_by_source, + wgt_by_source, source_bucket_offset, gc_nedges); + + // form the final coarse graph, which requires symmetrizing the matrix + matrix_t gc = collapse_directed_to_undirected( + nc, edges_per_source, source_bucket_offset, dest_by_source, + wgt_by_source); + + coarse_level_triple next_level; + next_level.mtx = gc; + return next_level; + } + + struct countingFunctor { + // counts adjancies for each coarse vertex + // also calculates coarse vertex wgts + matrix_t vcmap, g; + vtx_view_t mapped_edges, degree_initial; + vtx_view_t c_vtx_w, f_vtx_w; + ordinal_t workLength; + + countingFunctor(matrix_t _vcmap, matrix_t _g, vtx_view_t _mapped_edges, + vtx_view_t _degree_initial, vtx_view_t _c_vtx_w, + vtx_view_t _f_vtx_w) + : vcmap(_vcmap), + g(_g), + mapped_edges(_mapped_edges), + degree_initial(_degree_initial), + c_vtx_w(_c_vtx_w), + f_vtx_w(_f_vtx_w), + workLength(_g.numRows()) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member& t) const { + ordinal_t i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= workLength) return; + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + ordinal_t nonLoopEdgesTotal = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(t, start, end), + [=](const edge_offset_t idx, ordinal_t& local_sum) { + ordinal_t v = vcmap.graph.entries(g.graph.entries(idx)); + mapped_edges(idx) = v; + if (u != v) { + local_sum++; + } + }, + nonLoopEdgesTotal); + Kokkos::single(Kokkos::PerThread(t), [=]() { + Kokkos::atomic_add(°ree_initial(u), nonLoopEdgesTotal); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t& i) const { + ordinal_t u = vcmap.graph.entries(i); + edge_offset_t start = g.graph.row_map(i); + edge_offset_t end = g.graph.row_map(i + 1); + ordinal_t nonLoopEdgesTotal = 0; + for (edge_offset_t idx = start; idx < end; idx++) { + ordinal_t v = vcmap.graph.entries(g.graph.entries(idx)); + mapped_edges(idx) = v; + if (u != v) { + nonLoopEdgesTotal++; + } + } + Kokkos::atomic_add(°ree_initial(u), nonLoopEdgesTotal); + Kokkos::atomic_add(&c_vtx_w(u), f_vtx_w(i)); + } + }; + + static coarse_level_triple build_coarse_graph(coarsen_handle& handle, + const coarse_level_triple level, + const matrix_t vcmap) { + if (handle.b == Spgemm || handle.b == Spgemm_transpose_first) { + return build_coarse_graph_spgemm(handle, level, vcmap); + } + + matrix_t g = level.mtx; + ordinal_t n = g.numRows(); + ordinal_t nc = vcmap.numCols(); + + vtx_view_t mapped_edges("mapped edges", g.nnz()); + + vtx_view_t degree_initial("edges_per_source", nc); + vtx_view_t f_vtx_w = level.vtx_wgts; + vtx_view_t c_vtx_w = vtx_view_t("coarse vertex weights", nc); + + // count non-self loop edges per coarse vertex + // also computes coarse vertex weights + countingFunctor countF(vcmap, g, mapped_edges, degree_initial, c_vtx_w, + f_vtx_w); + if (is_host_space) { + Kokkos::parallel_for( + "count edges per coarse vertex (also compute coarse vertex weights)", + policy_t(0, n), countF); + } else { + auto execSpaceEnum = + KokkosKernels::Impl::kk_get_exec_space_type(); + int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size( + n, g.nnz(), execSpaceEnum); + team_policy_t dummy(1, 1, vectorLength); + int teamSize = dummy.team_size_max(countF, Kokkos::ParallelForTag()); + // count edges per vertex + Kokkos::parallel_for( + "count edges per coarse vertex (also compute coarse vertex weights)", + 
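+          // Launch roughly one thread per fine row: ceil(n / teamSize) teams
+          // of teamSize threads each; countF recovers the row index as
+          // league_rank * team_size + team_rank and vectorizes over the row
+          // entries with ThreadVectorRange.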
team_policy_t((n + teamSize - 1) / teamSize, teamSize, vectorLength), + countF); + } + + // compute max row size and avg row size + // use this to determine most efficient method for building coarse graph + // (for load balance primarily) + edge_offset_t total_unduped = 0; + ordinal_t max_unduped = 0; + Kokkos::parallel_reduce( + "find max", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& l_max) { + if (l_max <= degree_initial(i)) { + l_max = degree_initial(i); + } + }, + Kokkos::Max(max_unduped)); + Kokkos::parallel_reduce( + "find total", policy_t(0, nc), + KOKKOS_LAMBDA(const ordinal_t i, edge_offset_t& sum) { + sum += degree_initial(i); + }, + total_unduped); + ordinal_t avg_unduped = total_unduped / nc; + + coarse_level_triple next_level; + // optimized subroutines for sufficiently irregular graphs or high average + // adjacency rows don't do optimizations if running on CPU (the default host + // space) + if (avg_unduped > (nc / 4) && !is_host_space) { + next_level = + build_high_duplicity(handle, g, vcmap, mapped_edges, degree_initial); + } else if (avg_unduped > 50 && (max_unduped / 10) > avg_unduped && + !is_host_space) { + next_level = build_skew(handle, g, vcmap, mapped_edges, degree_initial); + } else { + next_level = + build_nonskew(handle, g, vcmap, mapped_edges, degree_initial); + } + + next_level.vtx_wgts = c_vtx_w; + next_level.level = level.level + 1; + next_level.interp_mtx = vcmap; + next_level.uniform_weights = false; + return next_level; + } + + static matrix_t generate_coarse_mapping(coarsen_handle& handle, + const matrix_t g, + bool uniform_weights) { + matrix_t interpolation_graph; + int choice = 0; + + switch (handle.h) { + case Match: choice = 0; break; + case MtMetis: choice = 1; break; + default: choice = 0; + } + + switch (handle.h) { + case HECv1: + interpolation_graph = mapper_t::coarsen_HEC(g, uniform_weights); + break; + case Match: + case MtMetis: + interpolation_graph = + mapper_t::coarsen_match(g, uniform_weights, choice); + break; + case MIS2: interpolation_graph = mapper_t::coarsen_mis_2(g); break; + case GOSHv2: interpolation_graph = mapper_t::coarsen_GOSH_v2(g); break; + case GOSHv1: interpolation_graph = mapper_t::coarsen_GOSH(g); break; + } + return interpolation_graph; + } + + // we can support weighted vertices pretty easily, but we don't rn + // this function can't return the generated list directly because of an NVCC + // compiler bug caller must use the get_levels() method after calling this + // function + static void generate_coarse_graphs(coarsen_handle& handle, + const matrix_t fine_g, + bool uniform_weights = false) { + ordinal_t fine_n = fine_g.numRows(); + std::list& levels = handle.results; + levels.clear(); + coarse_level_triple finest; + finest.mtx = fine_g; + // 1-indexed, not zero indexed + finest.level = 1; + finest.uniform_weights = uniform_weights; + vtx_view_t vtx_weights("vertex weights", fine_n); + Kokkos::deep_copy(vtx_weights, static_cast(1)); + finest.vtx_wgts = vtx_weights; + levels.push_back(finest); + while (levels.rbegin()->mtx.numRows() > handle.coarse_vtx_cutoff) { + coarse_level_triple current_level = *levels.rbegin(); + + matrix_t interp_graph = generate_coarse_mapping( + handle, current_level.mtx, current_level.uniform_weights); + + if (interp_graph.numCols() < handle.min_allowed_vtx) { + break; + } + + coarse_level_triple next_level = + build_coarse_graph(handle, current_level, interp_graph); + + levels.push_back(next_level); + + if (levels.size() > handle.max_levels) break; + } + } +}; + +} // end 
namespace Experimental +} // end namespace KokkosGraph +// exclude from Cuda builds without lambdas enabled +#endif diff --git a/src/graph/KokkosGraph_CoarsenHeuristics.hpp b/src/graph/KokkosGraph_CoarsenHeuristics.hpp new file mode 100644 index 0000000000..a9a5ce3e21 --- /dev/null +++ b/src/graph/KokkosGraph_CoarsenHeuristics.hpp @@ -0,0 +1,1191 @@ +#pragma once +// exclude from Cuda builds without lambdas enabled +#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#include +#include +#include +#include +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosGraph_MIS2.hpp" + +namespace KokkosGraph { + +namespace Experimental { + +template +class coarsen_heuristics { + public: + // define internal types + using matrix_t = crsMat; + using exec_space = typename matrix_t::execution_space; + using mem_space = typename matrix_t::memory_space; + using Device = typename matrix_t::device_type; + using ordinal_t = typename matrix_t::ordinal_type; + using edge_offset_t = typename matrix_t::size_type; + using scalar_t = typename matrix_t::value_type; + using vtx_view_t = typename Kokkos::View; + using wgt_view_t = typename Kokkos::View; + using edge_view_t = typename Kokkos::View; + using edge_subview_t = typename Kokkos::View; + using rand_view_t = typename Kokkos::View; + using graph_type = typename matrix_t::staticcrsgraph_type; + using policy_t = typename Kokkos::RangePolicy; + using team_policy_t = typename Kokkos::TeamPolicy; + using member = typename team_policy_t::member_type; + using part_view_t = typename Kokkos::View; + using pool_t = Kokkos::Random_XorShift64_Pool; + using gen_t = typename pool_t::generator_type; + using hasher_t = Kokkos::pod_hash; + static constexpr ordinal_t get_null_val() { + if (std::is_signed::value) { + return -1; + } else { + return std::numeric_limits::max(); + } + } + static constexpr ordinal_t ORD_MAX = get_null_val(); + + static vtx_view_t generate_permutation(ordinal_t n, pool_t rand_pool) { + rand_view_t randoms("randoms", n); + + Kokkos::parallel_for( + "create random entries", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + randoms(i) = generator.urand64(); + rand_pool.free_state(generator); + }); + + int t_buckets = 2 * n; + vtx_view_t buckets("buckets", t_buckets); + Kokkos::parallel_for( + "init buckets", policy_t(0, t_buckets), + KOKKOS_LAMBDA(ordinal_t i) { buckets(i) = ORD_MAX; }); + + uint64_t max = std::numeric_limits::max(); + uint64_t bucket_size = max / t_buckets; + Kokkos::parallel_for( + "insert buckets", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t bucket = randoms(i) / bucket_size; + // find the next open bucket + for (;; bucket++) { + if (bucket >= t_buckets) bucket -= t_buckets; + if (buckets(bucket) == ORD_MAX) { + // attempt to insert into bucket + if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), + ORD_MAX, i)) { + break; + } + } + } + }); + + vtx_view_t permute("permutation", n); + Kokkos::parallel_scan( + "extract permutation", policy_t(0, t_buckets), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + if (buckets(i) != ORD_MAX) { + if (final) { + permute(update) = buckets(i); + } + update++; + } + }); + + return permute; + } + + // create a mapping when some vertices are already mapped + // hn is a list of vertices such that vertex i wants to aggregate with vertex + // hn(i) + static ordinal_t parallel_map_construct_prefilled( + vtx_view_t vcmap, const ordinal_t n, const vtx_view_t vperm, + const vtx_view_t hn, 
Kokkos::View nvertices_coarse) { + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + match(i) = ORD_MAX; + } else { + match(i) = n + 1; + } + }); + ordinal_t perm_length = vperm.extent(0); + + // construct mapping using heaviest edges + int swap = 1; + vtx_view_t curr_perm = vperm; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + ordinal_t v = hn(u); + int condition = u < v; + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + if (vcmap(v) != ORD_MAX) { + vcmap(u) = vcmap(v); + } else { + match(u) = ORD_MAX; + } + } + } + } + }); + Kokkos::fence(); + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + if (vcmap(u) == ORD_MAX) { + ordinal_t add_next = Kokkos::atomic_fetch_add(&next_length(), 1); + next_perm(add_next) = u; + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + curr_perm = next_perm; + } + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + return nc; + } + + // hn is a list of vertices such that vertex i wants to aggregate with vertex + // hn(i) + static ordinal_t parallel_map_construct(vtx_view_t vcmap, const ordinal_t n, + const vtx_view_t vperm, + const vtx_view_t hn, + const vtx_view_t ordering) { + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); + ordinal_t perm_length = n; + Kokkos::View nvertices_coarse("nvertices"); + + // construct mapping using heaviest edges + int swap = 1; + vtx_view_t curr_perm = vperm; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = curr_perm(i); + ordinal_t v = hn(u); + int condition = ordering(u) < ordering(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + ordinal_t cv = u; + if (v < u) { + cv = v; + } + vcmap(u) = cv; + vcmap(v) = cv; + } else { + if (vcmap(v) != ORD_MAX) { + vcmap(u) = vcmap(v); + } else { + match(u) = ORD_MAX; + } + } + } + } + }); + Kokkos::fence(); + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? 
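+      // Compact the still-unmatched vertices into next_perm with one scan:
+      // on the final pass each unmatched u is written to its packed position
+      // and the last iteration stores the packed count in next_length.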
+ Kokkos::parallel_scan( + policy_t(0, perm_length), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = curr_perm(i); + if (vcmap(u) == ORD_MAX) { + if (final) { + next_perm(update) = u; + } + update++; + } + if (final && (i + 1) == perm_length) { + next_length() = update; + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + curr_perm = next_perm; + } + Kokkos::parallel_scan( + "assign aggregates", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t u, ordinal_t& update, const bool final) { + if (vcmap(u) == u) { + if (final) { + vcmap(u) = update; + } + update++; + } else if (final) { + vcmap(u) = vcmap(u) + n; + } + if (final && (u + 1) == n) { + nvertices_coarse() = update; + } + }); + Kokkos::parallel_for( + "propagate aggregates", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + if (vcmap(u) >= n) { + ordinal_t c_id = vcmap(u) - n; + vcmap(u) = vcmap(c_id); + } + }); + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + return nc; + } + + static part_view_t GOSH_clusters(const matrix_t& g) { + // finds the central vertices for GOSH clusters + // approximately this is a maximal independent set (if you pretend edges + // whose endpoints both exceed degree thresholds don't exist) IS vertices + // are preferred to be vertices with high degree, so it should be small + + ordinal_t n = g.numRows(); + + // 0: unassigned + // 1: in IS + //-1: adjacent to an IS vertex + part_view_t state("psuedo is membership", n); + + ordinal_t unassigned_total = n; + + // gonna keep this as an edge view in case we wanna do weighted degree + edge_view_t degrees("degrees", n); + vtx_view_t unassigned("unassigned vertices", n); + Kokkos::parallel_for( + "populate degrees", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + degrees(i) = g.graph.row_map(i + 1) - g.graph.row_map(i); + unassigned(i) = i; + }); + edge_offset_t threshold = g.nnz() / g.numRows(); + + while (unassigned_total > 0) { + part_view_t tuple_state("tuple state", n); + edge_view_t tuple_degree("tuple rand", n); + vtx_view_t tuple_idx("tuple index", n); + + part_view_t tuple_state_update("tuple state", n); + edge_view_t tuple_degree_update("tuple rand", n); + vtx_view_t tuple_idx_update("tuple index", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(const ordinal_t i) { + tuple_state(i) = state(i); + tuple_degree(i) = degrees(i); + tuple_idx(i) = i; + }); + + Kokkos::parallel_for( + policy_t(0, unassigned_total), KOKKOS_LAMBDA(const ordinal_t i) { + ordinal_t u = unassigned(i); + int max_state = tuple_state(u); + edge_offset_t max_degree = tuple_degree(u); + ordinal_t max_idx = tuple_idx(u); + + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + bool is_max = false; + if (tuple_state(v) > max_state) { + is_max = true; + } else if (tuple_state(v) == max_state) { + if (tuple_degree(v) > max_degree) { + is_max = true; + } else if (tuple_degree(v) == max_degree) { + if (tuple_idx(v) > max_idx) { + is_max = true; + } + } + } + // pretend edges between two vertices exceeding threshold do not + // exist + if (degrees(u) > threshold && degrees(v) > threshold) { + is_max = false; + } + if (is_max) { + max_state = tuple_state(v); + max_degree = tuple_degree(v); + max_idx = tuple_idx(v); + } + } + tuple_state_update(u) = max_state; + tuple_degree_update(u) = max_degree; + tuple_idx_update(u) = max_idx; + }); + + Kokkos::parallel_for( + policy_t(0, unassigned_total), KOKKOS_LAMBDA(const ordinal_t 
i) { + ordinal_t u = unassigned(i); + tuple_state(u) = tuple_state_update(u); + tuple_degree(u) = tuple_degree_update(u); + tuple_idx(u) = tuple_idx_update(u); + }); + + ordinal_t next_unassigned_total = 0; + Kokkos::parallel_reduce( + policy_t(0, unassigned_total), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = unassigned(i); + if (state(u) == 0) { + if (tuple_idx(u) == u) { + state(u) = 1; + } + // check if at least one of neighbors are in the IS or will be + // placed into the IS + else if (tuple_state(u) == 1 || + tuple_idx(tuple_idx(u)) == tuple_idx(u)) { + state(u) = -1; + } + } + if (state(u) == 0) { + thread_sum++; + } + }, + next_unassigned_total); + + vtx_view_t next_unassigned("next unassigned", next_unassigned_total); + Kokkos::parallel_scan( + "create next unassigned", policy_t(0, unassigned_total), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = unassigned(i); + if (state(u) == 0) { + if (final) { + next_unassigned(update) = u; + } + update++; + } + }); + unassigned_total = next_unassigned_total; + unassigned = next_unassigned; + } + return state; + } + + static matrix_t coarsen_mis_2(const matrix_t& g) { + ordinal_t n = g.numRows(); + + typename matrix_t::staticcrsgraph_type::entries_type::non_const_value_type + nc = 0; + vtx_view_t vcmap = KokkosGraph::graph_mis2_aggregate< + Device, typename matrix_t::staticcrsgraph_type::row_map_type, + typename matrix_t::staticcrsgraph_type::entries_type, vtx_view_t>( + g.graph.row_map, g.graph.entries, nc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_GOSH(const matrix_t& g) { + ordinal_t n = g.numRows(); + + part_view_t colors = GOSH_clusters(g); + + Kokkos::View nvc("nvertices_coarse"); + vtx_view_t vcmap("vcmap", n); + + int first_color = 1; + + // create aggregates for color 1 + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (colors(i) == first_color) { + vcmap(i) = Kokkos::atomic_fetch_add(&nvc(), 1); + } else { + vcmap(i) = ORD_MAX; + } + }); + + // add unaggregated vertices to aggregate of highest degree neighbor + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (colors(i) != first_color) { + // could use a thread team here + edge_offset_t max_degree = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = + g.graph.row_map(v + 1) - g.graph.row_map(v); + if (colors(v) == first_color && degree > max_degree) { + max_degree = degree; + vcmap(i) = vcmap(v); + } + } + } + }); + + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + 
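+          // one entry per fine vertex: column = aggregate id, weight = 1,
+          // i.e. a piecewise-constant interpolation operator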
}); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_GOSH_v2(const matrix_t& g) { + ordinal_t n = g.numRows(); + + Kokkos::View nvc("nvertices_coarse"); + vtx_view_t vcmap("vcmap", n); + + edge_offset_t threshold_d = g.nnz() / n; + if (threshold_d < 50) { + threshold_d = 50; + } + // create aggregates for large degree vtx + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (g.graph.row_map(i + 1) - g.graph.row_map(i) > threshold_d) { + ordinal_t cv = Kokkos::atomic_fetch_add(&nvc(), 1); + vcmap(i) = cv; + } else { + vcmap(i) = ORD_MAX; + } + }); + + // add vertex to max wgt neighbor's aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + ordinal_t argmax = ORD_MAX; + scalar_t max_w = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + ordinal_t wgt = g.values(j); + if (vcmap(v) != ORD_MAX) { + if (wgt >= max_w) { + max_w = wgt; + argmax = v; + } + } + } + if (argmax != ORD_MAX) { + vcmap(i) = vcmap(argmax); + } + } + }); + + // add vertex to max degree neighbor's aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + ordinal_t argmax = ORD_MAX; + edge_offset_t max_d = 0; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + edge_offset_t degree = + g.graph.row_map(v + 1) - g.graph.row_map(v); + if (vcmap(v) != ORD_MAX) { + if (degree >= max_d) { + max_d = degree; + argmax = v; + } + } + } + if (argmax != ORD_MAX) { + vcmap(i) = vcmap(argmax); + } + } + }); + + // add neighbors of each aggregated vertex to aggregate + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) != ORD_MAX) { + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + if (vcmap(v) == ORD_MAX) { + vcmap(v) = vcmap(i); + } + } + } + }); + + ordinal_t remaining_total = 0; + + Kokkos::parallel_reduce( + "count remaining", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& sum) { + if (vcmap(i) == ORD_MAX) { + sum++; + } + }, + remaining_total); + + vtx_view_t remaining("remaining vtx", remaining_total); + + Kokkos::parallel_scan( + "count remaining", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, const bool final) { + if (vcmap(i) == ORD_MAX) { + if (final) { + remaining(update) = i; + } + update++; + } + }); + + vtx_view_t hn("heaviest neighbors", n); + + pool_t rand_pool(std::time(nullptr)); + + Kokkos::parallel_for( + "fill hn", policy_t(0, remaining_total), + KOKKOS_LAMBDA(ordinal_t r_idx) { + // select heaviest neighbor with ties randomly broken + ordinal_t i = remaining(r_idx); + ordinal_t hn_i = ORD_MAX; + uint64_t max_rand = 0; + scalar_t max_ewt = 0; + + edge_offset_t end_offset = g.graph.row_map(i + 1); + for (edge_offset_t j = g.graph.row_map(i); j < end_offset; j++) { + scalar_t wgt = g.values(j); + ordinal_t v = g.graph.entries(j); + gen_t generator = rand_pool.get_state(); + uint64_t rand = generator.urand64(); + rand_pool.free_state(generator); + bool choose = false; + if (max_ewt < wgt) { + choose = true; + } else if (max_ewt == wgt && max_rand <= rand) { + choose = true; + } + + if (choose) { + max_ewt = wgt; + max_rand = rand; + hn_i = v; + } + } + hn(i) = hn_i; + }); + + ordinal_t nc = + 
parallel_map_construct_prefilled(vcmap, n, remaining, hn, nvc); + Kokkos::deep_copy(nc, nvc); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static matrix_t coarsen_HEC(const matrix_t& g, bool uniform_weights) { + ordinal_t n = g.numRows(); + + vtx_view_t hn("heavies", n); + + vtx_view_t vcmap("vcmap", n); + + Kokkos::parallel_for( + "initialize vcmap", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + + pool_t rand_pool(std::time(nullptr)); + + vtx_view_t vperm = generate_permutation(n, rand_pool); + + vtx_view_t reverse_map("reversed", n); + Kokkos::parallel_for( + "construct reverse map", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + + if (uniform_weights) { + // all weights equal at this level so choose heaviest edge randomly + Kokkos::parallel_for( + "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + if (adj_size > 0) { + ordinal_t offset = + g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); + } else { + hn(i) = generator.urand64() % n; + } + rand_pool.free_state(generator); + }); + } else { + Kokkos::parallel_for( + "Heaviest HN", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t i = thread.league_rank(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + if (adj_size > 0) { + edge_offset_t end = g.graph.row_map(i + 1); + typename Kokkos::MaxLoc::value_type argmax{}; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, g.graph.row_map(i), end), + [=](const edge_offset_t idx, + Kokkos::ValLocScalar& local) { + scalar_t wgt = g.values(idx); + if (wgt >= local.val) { + local.val = wgt; + local.loc = idx; + } + }, + Kokkos::MaxLoc(argmax)); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + ordinal_t h = g.graph.entries(argmax.loc); + hn(i) = h; + }); + } else { + gen_t generator = rand_pool.get_state(); + hn(i) = generator.urand64() % n; + rand_pool.free_state(generator); + } + }); + } + ordinal_t nc = 0; + nc = parallel_map_construct(vcmap, n, vperm, hn, reverse_map); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } + + static ordinal_t countInf(vtx_view_t target) { + ordinal_t totalInf = 0; + + Kokkos::parallel_reduce( + policy_t(0, target.extent(0)), + KOKKOS_LAMBDA(ordinal_t i, ordinal_t & thread_sum) { + if (target(i) == ORD_MAX) { + thread_sum++; + } + }, + totalInf); + + return totalInf; + } + + struct MatchByHashSorted { + vtx_view_t vcmap, unmapped; + Kokkos::View 
hashes; + ordinal_t unmapped_total; + Kokkos::View nvertices_coarse; + MatchByHashSorted(vtx_view_t _vcmap, vtx_view_t _unmapped, + Kokkos::View _hashes, + ordinal_t _unmapped_total, + Kokkos::View _nvertices_coarse) + : vcmap(_vcmap), + unmapped(_unmapped), + hashes(_hashes), + unmapped_total(_unmapped_total), + nvertices_coarse(_nvertices_coarse) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_t i, ordinal_t& update, + const bool final) const { + ordinal_t u = unmapped(i); + ordinal_t tentative = 0; + if (i == 0) { + tentative = i; + } else if (hashes(i - 1) != hashes(i)) { + tentative = i; + } + + if (tentative > update) { + update = tentative; + } + + if (final) { + // update should contain the index of the first hash that equals + // hash(i), could be i we want to determine if i is an odd offset from + // update + ordinal_t isOddOffset = (i - update) & 1; + // if even (0 counts as even) we match unmapped(i) with unmapped(i+1) if + // hash(i) == hash(i+1) if odd do nothing + if (isOddOffset == 0) { + if (i + 1 < unmapped_total) { + if (hashes(i) == hashes(i + 1)) { + ordinal_t v = unmapped(i + 1); + vcmap(u) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(v) = vcmap(u); + } + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ordinal_t& update, + volatile const ordinal_t& input) const { + if (input > update) update = input; + } + }; + + static matrix_t coarsen_match(const matrix_t& g, bool uniform_weights, + int match_choice) { + ordinal_t n = g.numRows(); + + vtx_view_t hn("heavies", n); + + vtx_view_t vcmap("vcmap", n); + + Kokkos::parallel_for( + "initialize vcmap", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { vcmap(i) = ORD_MAX; }); + + rand_view_t randoms("randoms", n); + + pool_t rand_pool(std::time(nullptr)); + + vtx_view_t vperm = generate_permutation(n, rand_pool); + + vtx_view_t reverse_map("reversed", n); + Kokkos::parallel_for( + "construct reverse map", policy_t(0, n), + KOKKOS_LAMBDA(ordinal_t i) { reverse_map(vperm(i)) = i; }); + + if (uniform_weights) { + // all weights equal at this level so choose heaviest edge randomly + Kokkos::parallel_for( + "Random HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + gen_t generator = rand_pool.get_state(); + ordinal_t adj_size = g.graph.row_map(i + 1) - g.graph.row_map(i); + ordinal_t offset = + g.graph.row_map(i) + (generator.urand64() % adj_size); + hn(i) = g.graph.entries(offset); + rand_pool.free_state(generator); + }); + } else { + Kokkos::parallel_for( + "Heaviest HN", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t hn_i = g.graph.entries(g.graph.row_map(i)); + scalar_t max_ewt = g.values(g.graph.row_map(i)); + + edge_offset_t end_offset = + g.graph.row_map(i + 1); // +g.edges_per_source[i]; + + for (edge_offset_t j = g.graph.row_map(i) + 1; j < end_offset; + j++) { + if (max_ewt < g.values(j)) { + max_ewt = g.values(j); + hn_i = g.graph.entries(j); + } + } + hn(i) = hn_i; + }); + } + vtx_view_t match("match", n); + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { match(i) = ORD_MAX; }); + + ordinal_t perm_length = n; + + Kokkos::View nvertices_coarse("nvertices"); + + // construct mapping using heaviest edges + int swap = 1; + while (perm_length > 0) { + vtx_view_t next_perm("next perm", perm_length); + Kokkos::View next_length("next_length"); + + // match vertices with heaviest unmatched edge + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = vperm(i); + ordinal_t v = hn(u); + int condition = reverse_map(u) < 
reverse_map(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, + v)) { + if (u == v || Kokkos::atomic_compare_exchange_strong( + &match(v), ORD_MAX, u)) { + // u == v avoids problems if there is a self-loop edge + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + match(u) = ORD_MAX; + } + } + } + }); + Kokkos::fence(); + + // add the ones that failed to be reprocessed next round + // maybe count these then create next_perm to save memory? + Kokkos::parallel_for( + policy_t(0, perm_length), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = vperm(i); + if (vcmap(u) == ORD_MAX) { + ordinal_t h = ORD_MAX; + + if (uniform_weights) { + ordinal_t max_ewt = 0; + // we have to iterate over the edges anyways because we need to + // check if any are unmatched! so instead of randomly choosing a + // heaviest edge, we instead use the reverse permutation order + // as the weight + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // using <= so that zero weight edges may still be chosen + if (max_ewt <= reverse_map(v)) { + max_ewt = reverse_map(v); + h = v; + } + } + } + } else { + scalar_t max_ewt = 0; + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // using <= so that zero weight edges may still be chosen + if (max_ewt <= g.values(j)) { + max_ewt = g.values(j); + h = v; + } + } + } + } + + if (h != ORD_MAX) { + ordinal_t add_next = + Kokkos::atomic_fetch_add(&next_length(), 1); + next_perm(add_next) = u; + hn(u) = h; + } + } + }); + Kokkos::fence(); + swap = swap ^ 1; + Kokkos::deep_copy(perm_length, next_length); + vperm = next_perm; + } + + if (match_choice == 1) { + ordinal_t unmapped = countInf(vcmap); + double unmappedRatio = + static_cast(unmapped) / static_cast(n); + + // leaf matches + if (unmappedRatio > 0.25) { + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + if (vcmap(u) != ORD_MAX) { + ordinal_t lastLeaf = ORD_MAX; + for (edge_offset_t j = g.graph.row_map(u); + j < g.graph.row_map(u + 1); j++) { + ordinal_t v = g.graph.entries(j); + // v must be unmatched to be considered + if (vcmap(v) == ORD_MAX) { + // must be degree 1 to be a leaf + if (g.graph.row_map(v + 1) - g.graph.row_map(v) == 1) { + if (lastLeaf == ORD_MAX) { + lastLeaf = v; + } else { + vcmap(lastLeaf) = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(v) = vcmap(lastLeaf); + lastLeaf = ORD_MAX; + } + } + } + } + } + }); + } + + unmapped = countInf(vcmap); + unmappedRatio = static_cast(unmapped) / static_cast(n); + + // twin matches + if (unmappedRatio > 0.25) { + vtx_view_t unmappedVtx("unmapped vertices", unmapped); + Kokkos::View hashes("hashes", unmapped); + + Kokkos::View unmappedIdx("unmapped index"); + hasher_t hasher; + // compute digests of adjacency lists + Kokkos::parallel_for( + "create digests", team_policy_t(n, Kokkos::AUTO), + KOKKOS_LAMBDA(const member& thread) { + ordinal_t u = thread.league_rank(); + if (vcmap(u) == ORD_MAX) { + uint32_t hash = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, g.graph.row_map(u), + g.graph.row_map(u + 1)), + [=](const edge_offset_t j, uint32_t& thread_sum) { + 
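+                      // summing per-neighbor hashes gives an order-independent
+                      // digest, so unmatched rows with identical adjacency
+                      // lists ("twins") get equal hashes and end up adjacent
+                      // after the sort below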
thread_sum += hasher(g.graph.entries(j)); + }, + hash); + Kokkos::single(Kokkos::PerTeam(thread), [=]() { + ordinal_t idx = Kokkos::atomic_fetch_add(&unmappedIdx(), 1); + unmappedVtx(idx) = u; + hashes(idx) = hash; + }); + } + }); + uint32_t max = std::numeric_limits::max(); + typedef Kokkos::BinOp1D > BinOp; + BinOp bin_op(unmapped, 0, max); + // VERY important that final parameter is true + Kokkos::BinSort, BinOp, exec_space, + ordinal_t> + sorter(hashes, bin_op, true); + sorter.create_permute_vector(); + sorter.template sort >(hashes); + sorter.template sort(unmappedVtx); + + MatchByHashSorted matchTwinFunctor(vcmap, unmappedVtx, hashes, unmapped, + nvertices_coarse); + Kokkos::parallel_scan("match twins", policy_t(0, unmapped), + matchTwinFunctor); + } + + unmapped = countInf(vcmap); + unmappedRatio = static_cast(unmapped) / static_cast(n); + + // relative matches + if (unmappedRatio > 0.25) { + // get possibly mappable vertices of unmapped + vtx_view_t mappableVtx("mappable vertices", unmapped); + Kokkos::parallel_scan( + "get unmapped", policy_t(0, n), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + if (vcmap(i) == ORD_MAX) { + if (final) { + mappableVtx(update) = i; + } + + update++; + } + }); + + ordinal_t mappable_count = unmapped; + do { + Kokkos::parallel_for( + "reset hn", policy_t(0, mappable_count), + KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = mappableVtx(i); + hn(u) = ORD_MAX; + }); + + // choose relatives for unmapped vertices + Kokkos::parallel_for( + "assign relatives", policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) != ORD_MAX) { + ordinal_t last_free = ORD_MAX; + for (edge_offset_t j = g.graph.row_map(i); + j < g.graph.row_map(i + 1); j++) { + ordinal_t v = g.graph.entries(j); + if (vcmap(v) == ORD_MAX) { + if (last_free != ORD_MAX) { + // there can be multiple threads updating this but it + // doesn't matter as long as they have some value + hn(last_free) = v; + hn(v) = last_free; + last_free = ORD_MAX; + } else { + last_free = v; + } + } + } + } + }); + + // create a list of all mappable vertices according to set entries of + // hn + ordinal_t old_mappable = mappable_count; + mappable_count = 0; + Kokkos::parallel_reduce( + "count mappable", policy_t(0, old_mappable), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& thread_sum) { + ordinal_t u = mappableVtx(i); + if (hn(u) != ORD_MAX) { + thread_sum++; + } + }, + mappable_count); + + vtx_view_t nextMappable("next mappable vertices", mappable_count); + + Kokkos::parallel_scan( + "get next mappable", policy_t(0, old_mappable), + KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update, + const bool final) { + ordinal_t u = mappableVtx(i); + if (hn(u) != ORD_MAX) { + if (final) { + nextMappable(update) = u; + } + + update++; + } + }); + mappableVtx = nextMappable; + + // match vertices with chosen relative + if (mappable_count > 0) { + Kokkos::parallel_for( + policy_t(0, mappable_count), KOKKOS_LAMBDA(ordinal_t i) { + ordinal_t u = mappableVtx(i); + ordinal_t v = hn(u); + int condition = reverse_map(u) < reverse_map(v); + // need to enforce an ordering condition to allow hard-stall + // conditions to be broken + if (condition ^ swap) { + if (Kokkos::atomic_compare_exchange_strong(&match(u), + ORD_MAX, v)) { + if (Kokkos::atomic_compare_exchange_strong(&match(v), + ORD_MAX, u)) { + ordinal_t cv = + Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + vcmap(u) = cv; + vcmap(v) = cv; + } else { + match(u) = ORD_MAX; + } + } + } + }); + } + Kokkos::fence(); + swap = swap ^ 1; + } while 
(mappable_count > 0); + } + } + + // create singleton aggregates of remaining unmatched vertices + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t i) { + if (vcmap(i) == ORD_MAX) { + vcmap(i) = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); + } + }); + + ordinal_t nc = 0; + Kokkos::deep_copy(nc, nvertices_coarse); + + edge_view_t row_map("interpolate row map", n + 1); + + Kokkos::parallel_for( + policy_t(0, n + 1), KOKKOS_LAMBDA(ordinal_t u) { row_map(u) = u; }); + + vtx_view_t entries("interpolate entries", n); + wgt_view_t values("interpolate values", n); + // compute the interpolation weights + Kokkos::parallel_for( + policy_t(0, n), KOKKOS_LAMBDA(ordinal_t u) { + entries(u) = vcmap(u); + values(u) = 1.0; + }); + + graph_type graph(entries, row_map); + matrix_t interp("interpolate", nc, values, graph); + + return interp; + } +}; + +} // end namespace Experimental +} // end namespace KokkosGraph +// exclude from Cuda builds without lambdas enabled +#endif diff --git a/unit_test/graph/Test_Graph.hpp b/unit_test/graph/Test_Graph.hpp index 9cce1f52b2..c2aeeab9d3 100644 --- a/unit_test/graph/Test_Graph.hpp +++ b/unit_test/graph/Test_Graph.hpp @@ -5,6 +5,7 @@ #include "Test_Graph_graph_color_distance2.hpp" #include "Test_Graph_graph_color.hpp" #include "Test_Graph_mis2.hpp" +#include "Test_Graph_coarsen.hpp" #include "Test_Graph_rcm.hpp" #endif // TEST_GRAPH_HPP diff --git a/unit_test/graph/Test_Graph_coarsen.hpp b/unit_test/graph/Test_Graph_coarsen.hpp new file mode 100644 index 0000000000..af8ec55a23 --- /dev/null +++ b/unit_test/graph/Test_Graph_coarsen.hpp @@ -0,0 +1,481 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Mike Gilbert (msg5334@psu.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include + +#include "KokkosGraph_CoarsenConstruct.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" + +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +using namespace KokkosGraph; +using namespace KokkosGraph::Experimental; + +// namespace Test { + +template +bool verify_coarsening(typename coarsener_t::coarse_level_triple fine_l, + typename coarsener_t::coarse_level_triple coarse_l) { + using crsMat = typename coarsener_t::matrix_t; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename coarsener_t::wgt_view_t; + using ordinal_t = typename entries_t::value_type; + using edge_t = typename rowmap_t::value_type; + + crsMat A = fine_l.mtx; + crsMat coarse_A = coarse_l.mtx; + auto f_rowmap = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto c_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + coarse_A.graph.row_map); + typename c_entries_t::HostMirror f_entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + typename c_entries_t::HostMirror vcmap = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), coarse_l.interp_mtx.graph.entries); + typename svt::HostMirror few = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + typename svt::HostMirror cew = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarse_A.values); + typename entries_t::HostMirror fvw = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), fine_l.vtx_wgts); + typename entries_t::HostMirror cvw = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), coarse_l.vtx_wgts); + ordinal_t f_size = 0; + ordinal_t c_size = 0; + for (ordinal_t i = 0; i < static_cast(fvw.extent(0)); i++) { + f_size += fvw(i); + } + for (ordinal_t i = 0; i < static_cast(cvw.extent(0)); i++) { + c_size += cvw(i); + } + // number of columns in interpolation matrix should give number of rows in + // coarse matrix + if (coarse_l.interp_mtx.numCols() != coarse_l.mtx.numRows()) { + return false; + } + // sum of vertex weights in each graph should be equal + if (f_size != c_size) { + return false; + } + typename svt::value_type f_edges = 0, c_edges = 0; + for (ordinal_t i = 0; i < A.numRows(); i++) { + for (edge_t j = f_rowmap(i); j < f_rowmap(i + 1); j++) { + ordinal_t v = f_entries(j); + if (vcmap(i) != vcmap(v)) { + f_edges += few(j); + } + } + } + for (ordinal_t i = 0; i < coarse_A.numRows(); i++) { + for (edge_t j = c_rowmap(i); j < c_rowmap(i + 1); j++) { + c_edges += cew(j); + } + } + // sum of inter-aggregate edges in fine graph should be sum of all edges in + // coarse graph + if (f_edges != c_edges) { + return false; + } + return true; +} + +template +bool verify_is_graph(crsMat A) { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename 
c_entries_t::non_const_type; + using ordinal_t = typename entries_t::value_type; + using edge_t = typename rowmap_t::value_type; + auto rowmap = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + typename c_entries_t::HostMirror entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + + for (ordinal_t i = 0; i < A.numRows(); i++) { + std::set adjset; + for (edge_t j = rowmap(i); j < rowmap(i + 1); j++) { + ordinal_t v = entries(j); + // A should not contain out-of-bounds columns + if (v >= A.numRows()) { + return false; + } + // Each row should not contain duplicate columns + if (adjset.find(v) != adjset.end()) { + return false; + } + adjset.insert(v); + } + } + return true; +} + +template +bool verify_aggregator(crsMat A, crsMat agg) { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_entries_t = typename graph_type::entries_type; + using entries_t = typename c_entries_t::non_const_type; + using ordinal_t = typename entries_t::value_type; + + // aggregator should have as many rows as A + if (A.numRows() != agg.numRows()) { + return false; + } + // aggregator should have as many entries as A has rows + if (A.numRows() != static_cast(agg.nnz())) { + return false; + } + // column count gives number of vertices in coarse graph + // aggregator should not have more columns than A has rows + // it is valid for coarse graph to be same size as input graph + // some graphs (for example a graph with only self-loops) may produce + // coarsenings that don't shrink the graph + if (A.numRows() < agg.numCols()) { + return false; + } + typename c_entries_t::HostMirror entries = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + agg.graph.entries); + + std::vector aggregateSizes(agg.numCols(), 0); + for (ordinal_t i = 0; i < static_cast(agg.nnz()); i++) { + ordinal_t v = entries(i); + // aggregator should not have out-of-bounds columns + if (v >= agg.numCols()) { + return false; + } + aggregateSizes[v]++; + } + for (ordinal_t i = 0; i < agg.numCols(); i++) { + // each aggregate label should contain at least one fine vertex + if (aggregateSizes[i] == 0) { + return false; + } + } + return true; +} + +template +crsMat gen_grid() { + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; + using lno_t = typename crsMat::ordinal_type; + using scalar = typename crsMat::value_type; + lno_t rows = 200; + lno_t cols = 300; + lno_t total_vtx = rows * cols; + + // create a 2D-mesh + rowmap_t row_map("rowmap", total_vtx + 1); + auto rm = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), row_map); + rm(0) = 0; + for (lno_t i = 0; i < rows; i++) { + for (lno_t j = 0; j < cols; j++) { + lno_t vtx_id = i * cols + j; + lno_t edge_count = 0; + if (i > 0) edge_count++; + if (i + 1 < rows) edge_count++; + if (j > 0) edge_count++; + if (j + 1 < cols) edge_count++; + rm(vtx_id + 1) = rm(vtx_id) + edge_count; + } + } + lno_t total_edges = rm(total_vtx); + Kokkos::deep_copy(row_map, rm); + entries_t entries("entries", total_edges); + auto e = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + + for (lno_t i = 0; i < rows; i++) { + for (lno_t j = 0; j < cols; j++) { + lno_t vtx_id = i * cols + j; + lno_t write_offset = rm(vtx_id); + if (i > 0) { + 
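+        // neighbor in the previous grid row sits exactly cols entries earlier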
e(write_offset) = vtx_id - cols; + write_offset++; + } + if (i + 1 < rows) { + e(write_offset) = vtx_id + cols; + write_offset++; + } + if (j > 0) { + e(write_offset) = vtx_id - 1; + write_offset++; + } + if (j + 1 < cols) { + e(write_offset) = vtx_id + 1; + } + } + } + Kokkos::deep_copy(entries, e); + graph_type g(entries, row_map); + svt values("values", total_edges); + Kokkos::deep_copy(values, static_cast(1)); + crsMat A("A", total_vtx, values, g); + return A; +} + +template +void test_multilevel_coarsen_grid() { + using crsMat = + KokkosSparse::CrsMatrix; + crsMat A = gen_grid(); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + handle.h = coarsener_t::HECv1; + handle.b = coarsener_t::Hybrid; + coarsener_t::generate_coarse_graphs(handle, A, true); + std::list levels = handle.results; + auto fine = levels.begin(); + auto coarse = fine; + coarse++; + while (coarse != levels.end()) { + bool correct_aggregator = verify_aggregator(fine->mtx, coarse->interp_mtx); + EXPECT_TRUE(correct_aggregator) + << "Multilevel coarsening produced invalid aggregator on level " + << coarse->level - 1; + bool correct_graph = verify_is_graph(coarse->mtx); + bool correct_coarsening = verify_coarsening(*fine, *coarse); + EXPECT_TRUE(correct_graph) + << "Multilevel coarsening produced invalid graph on level " + << coarse->level; + EXPECT_TRUE(correct_coarsening) + << "Multilevel coarsening produced invalid coarsening on level " + << coarse->level; + fine++; + coarse++; + } +} + +template +void test_coarsen_grid() { + using crsMat = + KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_entries_t = typename graph_type::entries_type; + using entries_t = typename c_entries_t::non_const_type; + crsMat A = gen_grid(); + entries_t vWgts("vertex weights", A.numRows()); + Kokkos::deep_copy(vWgts, static_cast(1)); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + clt fine_A; + fine_A.mtx = A; + fine_A.vtx_wgts = vWgts; + fine_A.level = 0; + fine_A.uniform_weights = true; + std::vector heuristics = { + coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, + coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = { + coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + for (auto h : heuristics) { + handle.h = h; + crsMat aggregator = + coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); + EXPECT_TRUE(correct_aggregator) + << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; + for (auto b : builders) { + handle.b = b; + clt coarse_A = + coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = + verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " + << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; + } + } +} + +template +void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, + lno_t 
row_size_variance) { + using execution_space = typename device::execution_space; + using crsMat = + KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + using svt = typename crsMat::values_type; + // Generate graph + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( + numVerts, numVerts, nnz, row_size_variance, bandwidth); + auto G = A.graph; + // Symmetrize the graph + rowmap_t symRowmap; + entries_t symEntries; + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< + c_rowmap_t, c_entries_t, rowmap_t, entries_t, execution_space>( + numVerts, G.row_map, G.entries, symRowmap, symEntries); + graph_type GS(symEntries, symRowmap); + svt symValues("sym values", symEntries.extent(0)); + entries_t vWgts("vertex weights", numVerts); + Kokkos::deep_copy(symValues, static_cast(2.5)); + Kokkos::deep_copy(vWgts, static_cast(1)); + crsMat AS("A symmetric", numVerts, symValues, GS); + using coarsener_t = coarse_builder; + typename coarsener_t::coarsen_handle handle; + using clt = typename coarsener_t::coarse_level_triple; + clt fine_A; + fine_A.mtx = AS; + fine_A.vtx_wgts = vWgts; + fine_A.level = 0; + fine_A.uniform_weights = true; + std::vector heuristics = { + coarsener_t::HECv1, coarsener_t::Match, coarsener_t::MtMetis, + coarsener_t::MIS2, coarsener_t::GOSHv1, coarsener_t::GOSHv2}; + std::vector builders = { + coarsener_t::Sort, coarsener_t::Hashmap, coarsener_t::Hybrid, + coarsener_t::Spgemm, coarsener_t::Spgemm_transpose_first}; + for (auto h : heuristics) { + handle.h = h; + crsMat aggregator = + coarsener_t::generate_coarse_mapping(handle, fine_A.mtx, true); + bool correct_aggregator = verify_aggregator(fine_A.mtx, aggregator); + EXPECT_TRUE(correct_aggregator) + << "Aggregation heuristic " << static_cast(h) + << " produced invalid aggregator."; + for (auto b : builders) { + handle.b = b; + clt coarse_A = + coarsener_t::build_coarse_graph(handle, fine_A, aggregator); + bool correct_graph = verify_is_graph(coarse_A.mtx); + bool correct_coarsening = + verify_coarsening(fine_A, coarse_A); + EXPECT_TRUE(correct_graph) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid graph with aggregation heuristic " + << static_cast(h) << "."; + EXPECT_TRUE(correct_coarsening) + << "Coarsening with dedupe method " << static_cast(b) + << " produced invalid coarsening with aggregation heuristic " + << static_cast(h) << "."; + } + } +} + +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + graph##_##random_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_random(5000, 5000 * 20, \ + 1000, 10); \ + test_coarsen_random(50, 50 * 10, 40, 10); \ + test_coarsen_random(5, 5 * 3, 5, 0); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##grid_graph_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coarsen_grid(); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##grid_graph_multilevel_coarsen##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_multilevel_coarsen_grid(); \ + } + +// FIXME_SYCL +#ifndef KOKKOS_ENABLE_SYCL +#if defined(KOKKOSKERNELS_INST_DOUBLE) +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
+EXECUTE_TEST(double, int, int, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif +#endif + +#undef EXECUTE_TEST
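+// Illustrative driver sketch, mirroring test_multilevel_coarsen_grid above.
+// It is not exercised by the TEST_F macros; example_coarsen_driver is a
+// hypothetical name, and coarsener_t is assumed to be an instantiation of
+// coarse_builder (as in the tests).
+template <class coarsener_t>
+void example_coarsen_driver(const typename coarsener_t::matrix_t& A) {
+  using clt = typename coarsener_t::coarse_level_triple;
+  typename coarsener_t::coarsen_handle handle;
+  handle.h = coarsener_t::HECv1;   // aggregation heuristic
+  handle.b = coarsener_t::Hybrid;  // coarse graph build / dedupe method
+  // builds the whole hierarchy; levels are read back from the handle because
+  // generate_coarse_graphs cannot return the list directly (NVCC workaround)
+  coarsener_t::generate_coarse_graphs(handle, A, true);
+  std::list<clt> levels = handle.results;
+  // levels.front() holds the input matrix; each later entry is one level
+  // coarser, with its interpolation matrix in interp_mtx
+  (void)levels;
+}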