diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d0faeafca76..5231cc7dd04 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -186,10 +186,8 @@ add_library(cugraph
     src/community/legacy/egonet.cu
     src/sampling/neighborhood.cu
     src/sampling/random_walks.cu
-    src/sampling/detail/gather_utils_impl.cu
     src/sampling/detail/sampling_utils_mg.cu
     src/sampling/detail/sampling_utils_sg.cu
-    src/sampling/nbr_sampling_mg.cu
     src/sampling/uniform_neighbor_sampling_mg.cpp
     src/sampling/uniform_neighbor_sampling_sg.cpp
     src/cores/legacy/core_number.cu
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 6664978a3a1..bbfffe09466 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -1468,42 +1468,6 @@ void core_number(raft::handle_t const& handle,
                  size_t k_last           = std::numeric_limits<size_t>::max(),
                  bool do_expensive_check = false);
 
-/**
- * @brief Multi-GPU Uniform Neighborhood Sampling.
- * @deprecated will be removed later in this release (22.06)
- *
- * @tparam graph_view_t Type of graph view.
- * @tparam gpu_t Type of rank (GPU) indices;
- * @tparam index_t Type used for indexing; typically edge_t
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Graph View object to generate NBR Sampling on.
- * @param ptr_d_starting_vertices Device array of starting vertex IDs for the NBR Sampling.
- * @param ptr_d_ranks Device array of: rank IDs (GPU IDs) for the NBR Sampling.
- * @param num_starting_vertices size of starting vertex set
- * @param h_fan_out vector of branching out (fan-out) degree per source vertex for each level
- * parameter used for obtaining local out-degree information
- * @param with_replacement boolean flag specifying if random sampling is done with replacement
- * (true); or, without replacement (false); default = true;
- * @return tuple of tuple of device vectors and counts:
- * ((vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index), rx_counts)
- */
-template <typename graph_view_t,
-          typename gpu_t,
-          typename index_t = typename graph_view_t::edge_type>
-std::tuple<std::tuple<rmm::device_uvector<typename graph_view_t::vertex_type>,
-                      rmm::device_uvector<typename graph_view_t::vertex_type>,
-                      rmm::device_uvector<gpu_t>,
-                      rmm::device_uvector<index_t>>,
-           std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t const& graph_view,
-                   typename graph_view_t::vertex_type const* ptr_d_starting_vertices,
-                   gpu_t const* ptr_d_ranks,
-                   size_t num_starting_vertices,
-                   std::vector<int> const& h_fan_out,
-                   bool with_replacement = true);
-
 /**
  * @brief Uniform Neighborhood Sampling.
  *
diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
index afe841e31cb..520c4272bce 100644
--- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh
+++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -255,7 +255,7 @@ __global__ void partially_decompress_to_edgelist_mid_degree(
       edge_partition.local_edges(major_partition_offset);
 
     auto major_offset = input_major_start_offsets[idx];
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
       output_majors[major_offset + i] = major;
       output_minors[major_offset + i] = indices[i];
 
@@ -290,7 +290,10 @@ void partially_decompress_edge_partition_to_fill_edgelist(
   vertex_t* minors,
   thrust::optional<weight_t*> weights,
   thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
-  thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
+  thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index,
+  // FIXME:  Once PR 2356 is merged, this parameter could go away because
+  // major_hypersparse_first will be part of edge_partition
+  std::optional<std::vector<vertex_t>> local_edge_partition_segment_offsets)
 {
   auto execution_policy = handle.get_thrust_policy();
   static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
@@ -408,6 +411,10 @@ void partially_decompress_edge_partition_to_fill_edgelist(
                           ? thrust::make_optional(thrust::make_tuple(
                         thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
                           : thrust::nullopt,
+       // FIXME:  Once PR 2356 is merged, this parameter could go away because
+       // major_hypersparse_first will be part of edge_partition
+       segment_offsets_last =
+         (*local_edge_partition_segment_offsets)[detail::num_sparse_segments_per_vertex_partition],
        global_edge_index] __device__(auto idx) {
         auto major        = input_majors[idx];
         auto major_offset = input_major_start_offsets[idx];
@@ -416,7 +423,10 @@ void partially_decompress_edge_partition_to_fill_edgelist(
           vertex_t const* indices{nullptr};
           thrust::optional<weight_t const*> weights{thrust::nullopt};
           edge_t local_degree{};
-          thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(*major_idx);
+          // FIXME:  Once PR 2356 is merged, this computation should be changed to use
+          // major_hypersparse_first which will be part of edge_partition
+          thrust::tie(indices, weights, local_degree) =
+            edge_partition.local_edges(segment_offsets_last + *major_idx);
           thrust::fill(
             thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
           thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
diff --git a/cpp/include/cugraph/detail/graph_functions.cuh b/cpp/include/cugraph/detail/graph_functions.cuh
deleted file mode 100644
index 049c7ba2a05..00000000000
--- a/cpp/include/cugraph/detail/graph_functions.cuh
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cugraph/edge_partition_device_view.cuh>
-#include <cugraph/partition_manager.hpp>
-#include <cugraph/utilities/device_comm.cuh>
-#include <cugraph/utilities/host_scalar_comm.cuh>
-
-#include <raft/handle.hpp>
-
-#include <thrust/binary_search.h>
-#include <thrust/distance.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/remove.h>
-#include <thrust/sort.h>
-#include <thrust/tabulate.h>
-
-#include <rmm/device_uvector.hpp>
-
-#include <numeric>
-#include <vector>
-
-namespace cugraph {
-namespace detail {
-namespace original {
-
-/**
- * @brief Compute local out degrees of the majors belonging to the adjacency matrices
- * stored on each gpu
- *
- * Iterate through partitions and store their local degrees
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @return A single vector containing the local out degrees of the majors belong to the adjacency
- * matrices
- */
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> compute_local_major_degrees(
-  raft::handle_t const& handle, GraphViewType const& graph_view);
-
-/**
- * @brief Calculate global degree information for all vertices represented by current gpu
- *
- * Calculate local degree and perform row wise exclusive scan over all gpus in column
- * communicator.
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @return Tuple of two device vectors. The first one contains per source edge-count encountered
- * by gpus in the column communicator before current gpu. The second device vector contains the
- * global out degree for every source represented by current gpu
- */
-template <typename GraphViewType>
-std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-get_global_degree_information(raft::handle_t const& handle, GraphViewType const& graph_view);
-
-/**
- * @brief Calculate global adjacency offset for all majors represented by current gpu
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @param[in] global_degree_offsets Global degree offset to local adjacency list for every major
- * represented by current gpu
- * @param global_out_degrees Global out degrees for every source represented by current gpu
- * @return Device vector containing the number of edges that are prior to the adjacency list of
- * every major that can be represented by the current gpu
- */
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees);
-
-/**
- * @brief Gather active majors and associated client gpu ids across gpus in a
- * column communicator
- *
- * Collect all the vertex ids and client gpu ids to be processed by every gpu in
- * the column communicator and call sort on the list.
- *
- * @tparam vertex_t Type of vertex indices.
- * @tparam VertexIterator  Type of the iterator for vertex identifiers.
- * @tparam GPUIdIterator  Type of the iterator for gpu id identifiers.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param vertex_input_first Iterator pointing to the first vertex id to be processed
- * @param vertex_input_last Iterator pointing to the last (exclusive) vertex id to be processed
- * @param gpu_id_first Iterator pointing to the first gpu id to be processed
- * @return Device vector containing all the vertices that are to be processed by every gpu
- * in the column communicator
- */
-template <typename GraphViewType, typename VertexIterator, typename GPUIdIterator>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename std::iterator_traits<GPUIdIterator>::value_type>>
-gather_active_majors(raft::handle_t const& handle,
-                     GraphViewType const& graph_view,
-                     VertexIterator vertex_input_first,
-                     VertexIterator vertex_input_last,
-                     GPUIdIterator gpu_id_first);
-
-/**
- * @brief Return global out degrees of active majors
- *
- * Get partition information of all graph partitions on the gpu and select
- * global degrees of all active majors
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @param active_majors Device vector containing all the vertex id that are processed by
- * gpus in the column communicator
- * @param global_out_degrees Global out degrees for every source represented by current gpu
- * @return Global out degrees of all majors in active_majors
- */
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees);
-
-/**
- * @brief Return partition information of all vertex ids of all the partitions belonging to a gpu
- *
- * Iterate through partitions and store the starting vertex ids, exclusive scan of vertex counts,
- * offsets and indices of the partitions csr structure
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @return Tuple of device vectors. The first vector contains all the partitions related to the
- * gpu. The second and third vectors contain starting and ending vertex ids of all the partitions
- * belonging to the gpu. The fourth vector contains the starting vertex id of the hypersparse
- * region in each partition. The fifth vector denotes the vertex count offset (how many vertices
- * are dealt with by the previous partitions.
- */
-template <typename GraphViewType>
-std::tuple<rmm::device_uvector<edge_partition_device_view_t<typename GraphViewType::vertex_type,
-                                                            typename GraphViewType::edge_type,
-                                                            typename GraphViewType::weight_type,
-                                                            GraphViewType::is_multi_gpu>>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>>
-partition_information(raft::handle_t const& handle, GraphViewType const& graph_view);
-
-/**
- * @brief Gather valid edges present on the current gpu
- *
- * Collect all the edges that are present in the adjacency lists on the current gpu
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @tparam gpu_t  Type of gpu id identifiers.
- * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
- * and handles to various CUDA libraries) to run graph algorithms.
- * @param[in] graph_view Non-owning graph object.
- * @param[in] active_majors Device vector containing all the vertex id that are processed by
- * gpus in the column communicator
- * @param[in] active_major_gpu_ids Device vector containing the gpu id associated by every vertex
- * present in active_majors
- * @param[in] minor_map Device vector of minor indices (modifiable in-place) corresponding to
- * vertex IDs being returned
- * @param[in] indices_per_major Number of indices supplied for every major in the range
- * [vertex_input_first, vertex_input_last)
- * @param[in] global_degree_offsets Global degree offset to local adjacency list for every major
- * represented by current gpu
- * @return A tuple of device vector containing the majors, minors, gpu_ids and indices gathered
- * locally
- */
-template <typename GraphViewType, typename gpu_t>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<gpu_t>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-gather_local_edges(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
-  rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
-  typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets);
-
-/**
- * @brief Gather edge list for specified vertices
- *
- * Collect all the edges that are present in the adjacency lists on the current gpu
- *
- * @tparam GraphViewType Type of the passed non-owning graph object.
- * @tparam prop_t  Type of the property associated with the majors.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @param active_majors Device vector containing all the vertex id that are processed by
- * gpus in the column communicator
- * @param active_major_property Device vector containing the property values associated by every
- * vertex present in active_majors
- * @return A tuple of device vector containing the majors, minors and properties gathered locally
- */
-template <typename GraphViewType, typename prop_t>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<prop_t>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-gather_one_hop_edgelist(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<prop_t>& active_major_property,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets);
-
-}  // namespace original
-}  // namespace detail
-}  // namespace cugraph
diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h
index 16c1a9011d4..dbefac81742 100644
--- a/cpp/include/cugraph_c/sampling_algorithms.h
+++ b/cpp/include/cugraph_c/sampling_algorithms.h
@@ -122,9 +122,6 @@ typedef struct {
  * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
  *                           needs to be transposed
  * @param [in]  start        Device array of start vertices for the sampling
- * @param [in]  start_label  Device array of start labels.  These labels will propagate to the
- * results so that the result can be properly organized when the input needs to be sent back to
- * different callers (different processes or different gpus).
  * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm
  * @param [in]  with_replacement
  *                           Boolean value.  If true selection of edges is done with
@@ -136,37 +133,7 @@ typedef struct {
  *                           be populated if error code is not CUGRAPH_SUCCESS
  * @return error code
  */
-// FIXME: This older API will be phased out this release in favor of the experimental one below
 cugraph_error_code_t cugraph_uniform_neighbor_sample(
-  const cugraph_resource_handle_t* handle,
-  cugraph_graph_t* graph,
-  const cugraph_type_erased_device_array_view_t* start,
-  const cugraph_type_erased_device_array_view_t* start_label,
-  const cugraph_type_erased_host_array_view_t* fan_out,
-  bool_t with_replacement,
-  bool_t do_expensive_check,
-  cugraph_sample_result_t** result,
-  cugraph_error_t** error);
-
-/**
- * @brief     Uniform Neighborhood Sampling
- *
- * @param [in]  handle       Handle for accessing resources
- * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
- *                           needs to be transposed
- * @param [in]  start        Device array of start vertices for the sampling
- * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm
- * @param [in]  with_replacement
- *                           Boolean value.  If true selection of edges is done with
- *                           replacement.  If false selection is done without replacement.
- * @param [in]  do_expensive_check
- *                           A flag to run expensive checks for input arguments (if set to true)
- * @param [in]  result       Output from the uniform_neighbor_sample call
- * @param [out] error        Pointer to an error object storing details of any error.  Will
- *                           be populated if error code is not CUGRAPH_SUCCESS
- * @return error code
- */
-cugraph_error_code_t cugraph_experimental_uniform_neighbor_sample(
   const cugraph_resource_handle_t* handle,
   cugraph_graph_t* graph,
   const cugraph_type_erased_device_array_view_t* start,
diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp
index 612284c93c8..ed458eaf1cd 100644
--- a/cpp/src/c_api/uniform_neighbor_sampling.cpp
+++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp
@@ -32,10 +32,11 @@ namespace cugraph {
 namespace c_api {
 
 struct cugraph_sample_result_t {
-  bool experimental_{true};
   cugraph_type_erased_device_array_t* src_{nullptr};
   cugraph_type_erased_device_array_t* dst_{nullptr};
   // FIXME: Will be deleted once experimental replaces current
+  //    NOTE: Leaving in place while we discuss some future changes, although
+  //          not currently used.
   cugraph_type_erased_device_array_t* label_{nullptr};
   cugraph_type_erased_device_array_t* index_{nullptr};
   // FIXME: Will be deleted once experimental replaces current
@@ -53,7 +54,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
   raft::handle_t const& handle_;
   cugraph::c_api::cugraph_graph_t* graph_{nullptr};
   cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_{nullptr};
-  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_label_{nullptr};
   cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr};
   bool with_replacement_{false};
   bool do_expensive_check_{false};
@@ -62,125 +62,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
   uniform_neighbor_sampling_functor(cugraph_resource_handle_t const* handle,
                                     cugraph_graph_t* graph,
                                     cugraph_type_erased_device_array_view_t const* start,
-                                    cugraph_type_erased_device_array_view_t const* start_label,
                                     cugraph_type_erased_host_array_view_t const* fan_out,
                                     bool with_replacement,
                                     bool do_expensive_check)
-    : abstract_functor(),
-      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
-      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
-      start_(
-        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(start)),
-      start_label_(reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
-        start_label)),
-      fan_out_(
-        reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)),
-      with_replacement_(with_replacement),
-      do_expensive_check_(do_expensive_check)
-  {
-  }
-
-  template <typename vertex_t,
-            typename edge_t,
-            typename weight_t,
-            bool store_transposed,
-            bool multi_gpu>
-  void operator()()
-  {
-    // FIXME: Think about how to handle SG vice MG
-    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
-      unsupported();
-    } else if constexpr (!multi_gpu) {
-      unsupported();
-    } else {
-      // uniform_nbr_sample expects store_transposed == false
-      if constexpr (store_transposed) {
-        error_code_ = cugraph::c_api::
-          transpose_storage<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
-            handle_, graph_, error_.get());
-        if (error_code_ != CUGRAPH_SUCCESS) return;
-      }
-
-      auto graph =
-        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, weight_t, false, multi_gpu>*>(
-          graph_->graph_);
-
-      auto graph_view = graph->view();
-
-      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
-
-      rmm::device_uvector<vertex_t> start(start_->size_, handle_.get_stream());
-      raft::copy(start.data(), start_->as_type<vertex_t>(), start.size(), handle_.get_stream());
-
-      //
-      // Need to renumber sources
-      //
-      cugraph::renumber_ext_vertices<vertex_t, multi_gpu>(
-        handle_,
-        start.data(),
-        start.size(),
-        number_map->data(),
-        graph_view.local_vertex_partition_range_first(),
-        graph_view.local_vertex_partition_range_last(),
-        false);
-
-      // C++ API wants an std::vector
-      std::vector<int> fan_out(fan_out_->size_);
-      std::copy_n(fan_out_->as_type<int>(), fan_out_->size_, fan_out.data());
-
-      auto&& [tmp_tuple, counts] = cugraph::uniform_nbr_sample(handle_,
-                                                               graph_view,
-                                                               start.data(),
-                                                               start_label_->as_type<int32_t>(),
-                                                               start.size(),
-                                                               fan_out,
-                                                               with_replacement_);
-
-      auto&& [srcs, dsts, labels, indices] = tmp_tuple;
-
-      std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();
-
-      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
-                                                            srcs.data(),
-                                                            srcs.size(),
-                                                            number_map->data(),
-                                                            vertex_partition_lasts,
-                                                            do_expensive_check_);
-
-      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
-                                                            dsts.data(),
-                                                            dsts.size(),
-                                                            number_map->data(),
-                                                            vertex_partition_lasts,
-                                                            do_expensive_check_);
-
-      result_ = new cugraph::c_api::cugraph_sample_result_t{
-        false,
-        new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_),
-        new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_),
-        new cugraph::c_api::cugraph_type_erased_device_array_t(labels, start_label_->type_),
-        new cugraph::c_api::cugraph_type_erased_device_array_t(indices, graph_->edge_type_),
-        new cugraph::c_api::cugraph_type_erased_host_array_t(counts, graph_->vertex_type_)};
-    }
-  }
-};
-
-struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
-  raft::handle_t const& handle_;
-  cugraph::c_api::cugraph_graph_t* graph_{nullptr};
-  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_{nullptr};
-  cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr};
-  bool with_replacement_{false};
-  bool do_expensive_check_{false};
-  cugraph::c_api::cugraph_sample_result_t* result_{nullptr};
-
-  experimental_uniform_neighbor_sampling_functor(
-    cugraph_resource_handle_t const* handle,
-    cugraph_graph_t* graph,
-    cugraph_type_erased_device_array_view_t const* start,
-    cugraph_type_erased_host_array_view_t const* fan_out,
-    bool with_replacement,
-    bool do_expensive_check)
     : abstract_functor(),
       handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
       graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
@@ -259,7 +143,6 @@ struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::a
                                                             do_expensive_check_);
 
       result_ = new cugraph::c_api::cugraph_sample_result_t{
-        true,
         new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_),
         new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_),
         nullptr,
@@ -276,7 +159,6 @@ extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample(
   const cugraph_resource_handle_t* handle,
   cugraph_graph_t* graph,
   const cugraph_type_erased_device_array_view_t* start,
-  const cugraph_type_erased_device_array_view_t* start_labels,
   const cugraph_type_erased_host_array_view_t* fan_out,
   bool_t with_replacement,
   bool_t do_expensive_check,
@@ -284,21 +166,6 @@ extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample(
   cugraph_error_t** error)
 {
   uniform_neighbor_sampling_functor functor{
-    handle, graph, start, start_labels, fan_out, with_replacement, do_expensive_check};
-  return cugraph::c_api::run_algorithm(graph, functor, result, error);
-}
-
-extern "C" cugraph_error_code_t cugraph_experimental_uniform_neighbor_sample(
-  const cugraph_resource_handle_t* handle,
-  cugraph_graph_t* graph,
-  const cugraph_type_erased_device_array_view_t* start,
-  const cugraph_type_erased_host_array_view_t* fan_out,
-  bool_t with_replacement,
-  bool_t do_expensive_check,
-  cugraph_sample_result_t** result,
-  cugraph_error_t** error)
-{
-  experimental_uniform_neighbor_sampling_functor functor{
     handle, graph, start, fan_out, with_replacement, do_expensive_check};
   return cugraph::c_api::run_algorithm(graph, functor, result, error);
 }
diff --git a/cpp/src/sampling/detail/gather_utils_impl.cu b/cpp/src/sampling/detail/gather_utils_impl.cu
deleted file mode 100644
index 3c8a7e2d16b..00000000000
--- a/cpp/src/sampling/detail/gather_utils_impl.cu
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sampling/detail/gather_utils_impl.cuh>
-
-namespace cugraph {
-namespace detail {
-namespace original {
-
-template rmm::device_uvector<int32_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, float, false, true> const& graph_view);
-
-template rmm::device_uvector<int32_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, true> const& graph_view);
-
-template rmm::device_uvector<int64_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, float, false, true> const& graph_view);
-
-template rmm::device_uvector<int64_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, true> const& graph_view);
-
-template rmm::device_uvector<int64_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, float, false, true> const& graph_view);
-
-template rmm::device_uvector<int64_t> compute_local_major_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-get_global_degree_information(raft::handle_t const& handle,
-                              graph_view_t<int32_t, int32_t, float, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-get_global_degree_information(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
-get_global_degree_information(raft::handle_t const& handle,
-                              graph_view_t<int32_t, int64_t, float, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
-get_global_degree_information(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
-get_global_degree_information(raft::handle_t const& handle,
-                              graph_view_t<int64_t, int64_t, float, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
-get_global_degree_information(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, true> const& graph_view);
-
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-                     int32_t const* vertex_input_first,
-                     int32_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-                     int32_t const* vertex_input_first,
-                     int32_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-                     int32_t const* vertex_input_first,
-                     int32_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-                     int32_t const* vertex_input_first,
-                     int32_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-                     int64_t const* vertex_input_first,
-                     int64_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
-gather_active_majors(raft::handle_t const& handle,
-                     graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-                     int64_t const* vertex_input_first,
-                     int64_t const* vertex_input_last,
-                     int32_t const* gpu_id_first);
-
-template rmm::device_uvector<int32_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-  const rmm::device_uvector<int32_t>& active_majors,
-  const rmm::device_uvector<int32_t>& global_out_degrees);
-
-template rmm::device_uvector<int32_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-  const rmm::device_uvector<int32_t>& active_majors,
-  const rmm::device_uvector<int32_t>& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-  const rmm::device_uvector<int32_t>& active_majors,
-  const rmm::device_uvector<int64_t>& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-  const rmm::device_uvector<int32_t>& active_majors,
-  const rmm::device_uvector<int64_t>& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-  const rmm::device_uvector<int64_t>& active_majors,
-  const rmm::device_uvector<int64_t>& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-  const rmm::device_uvector<int64_t>& active_majors,
-  const rmm::device_uvector<int64_t>& global_out_degrees);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int32_t, int32_t, float, true>>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int32_t, int32_t, float, false, true> const& graph_view);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int32_t, int32_t, double, true>>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int32_t, int32_t, double, false, true> const& graph_view);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int32_t, int64_t, float, true>>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int32_t, int64_t, float, false, true> const& graph_view);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int32_t, int64_t, double, true>>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>,
-  rmm::device_uvector<int32_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int32_t, int64_t, double, false, true> const& graph_view);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int64_t, int64_t, float, true>>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int64_t, int64_t, float, false, true> const& graph_view);
-
-template std::tuple<
-  rmm::device_uvector<edge_partition_device_view_t<int64_t, int64_t, double, true>>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>,
-  rmm::device_uvector<int64_t>>
-partition_information(raft::handle_t const& handle,
-                      graph_view_t<int64_t, int64_t, double, false, true> const& graph_view);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-                   const rmm::device_uvector<int32_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int32_t>&& minor_map,
-                   int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-                   const rmm::device_uvector<int32_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int32_t>&& minor_map,
-                   int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-                   const rmm::device_uvector<int32_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int64_t>&& minor_map,
-                   int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-                   const rmm::device_uvector<int32_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int64_t>&& minor_map,
-                   int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-                   const rmm::device_uvector<int64_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int64_t>&& minor_map,
-                   int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_local_edges(raft::handle_t const& handle,
-                   graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-                   const rmm::device_uvector<int64_t>& active_majors,
-                   const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                   rmm::device_uvector<int64_t>&& minor_map,
-                   int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-                        const rmm::device_uvector<int32_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-                        const rmm::device_uvector<int32_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-                        const rmm::device_uvector<int32_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-                        const rmm::device_uvector<int32_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-                        const rmm::device_uvector<int64_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int64_t>>
-gather_one_hop_edgelist(raft::handle_t const& handle,
-                        graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-                        const rmm::device_uvector<int64_t>& active_majors,
-                        const rmm::device_uvector<int32_t>& active_major_gpu_ids,
-                        const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
-
-}  // namespace original
-}  // namespace detail
-}  // namespace cugraph
diff --git a/cpp/src/sampling/detail/gather_utils_impl.cuh b/cpp/src/sampling/detail/gather_utils_impl.cuh
deleted file mode 100644
index fe8f04adcbb..00000000000
--- a/cpp/src/sampling/detail/gather_utils_impl.cuh
+++ /dev/null
@@ -1,775 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cugraph/detail/decompress_edge_partition.cuh>
-#include <cugraph/detail/graph_functions.cuh>
-#include <cugraph/edge_partition_device_view.cuh>
-#include <cugraph/partition_manager.hpp>
-#include <cugraph/utilities/device_comm.cuh>
-#include <cugraph/utilities/host_scalar_comm.cuh>
-
-#include <raft/handle.hpp>
-
-#include <thrust/binary_search.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/fill.h>
-#include <thrust/for_each.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/memory.h>
-#include <thrust/optional.h>
-#include <thrust/remove.h>
-#include <thrust/scan.h>
-#include <thrust/sort.h>
-#include <thrust/tabulate.h>
-#include <thrust/transform.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/tuple.h>
-
-#include <rmm/device_uvector.hpp>
-
-#include <numeric>
-#include <vector>
-
-namespace cugraph {
-namespace detail {
-namespace original {
-
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> compute_local_major_degrees(
-  raft::handle_t const& handle, GraphViewType const& graph_view)
-{
-  static_assert(GraphViewType::is_storage_transposed == false);
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using weight_t = typename GraphViewType::weight_type;
-
-  rmm::device_uvector<edge_t> local_degrees(GraphViewType::is_storage_transposed
-                                              ? graph_view.local_edge_partition_dst_range_size()
-                                              : graph_view.local_edge_partition_src_range_size(),
-                                            handle.get_stream());
-
-  // FIXME optimize for communication
-  // local_edge_partition_src_range_size == summation of major_range_size() of all partitions
-  // belonging to the gpu
-  vertex_t partial_offset{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition =
-      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-        graph_view.local_edge_partition_view(i));
-
-    // Check if hypersparse segment is present in the partition
-    auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
-    auto use_dcs         = segment_offsets
-                             ? ((*segment_offsets).size() > (num_sparse_segments_per_vertex_partition + 1))
-                             : false;
-
-    if (use_dcs) {
-      auto major_hypersparse_first = edge_partition.major_range_first() +
-                                     (*segment_offsets)[num_sparse_segments_per_vertex_partition];
-      // Calculate degrees in sparse region
-      auto sparse_begin = local_degrees.begin() + partial_offset;
-      auto sparse_end   = local_degrees.begin() + partial_offset +
-                        (major_hypersparse_first - edge_partition.major_range_first());
-      ;
-
-      thrust::tabulate(handle.get_thrust_policy(),
-                       sparse_begin,
-                       sparse_end,
-                       [offsets = edge_partition.offsets()] __device__(auto i) {
-                         return offsets[i + 1] - offsets[i];
-                       });
-
-      // Calculate degrees in hypersparse region
-      auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count());
-      // Initialize hypersparse region degrees as 0
-      thrust::fill(handle.get_thrust_policy(),
-                   sparse_end,
-                   sparse_begin + edge_partition.major_range_size(),
-                   edge_t{0});
-      thrust::for_each(handle.get_thrust_policy(),
-                       thrust::make_counting_iterator(vertex_t{0}),
-                       thrust::make_counting_iterator(dcs_nzd_vertex_count),
-                       [major_hypersparse_first,
-                        major_range_first = edge_partition.major_range_first(),
-                        vertex_ids        = *(edge_partition.dcs_nzd_vertices()),
-                        offsets           = edge_partition.offsets(),
-                        local_degrees = thrust::raw_pointer_cast(sparse_begin)] __device__(auto i) {
-                         auto d = offsets[(major_hypersparse_first - major_range_first) + i + 1] -
-                                  offsets[(major_hypersparse_first - major_range_first) + i];
-                         auto v                               = vertex_ids[i];
-                         local_degrees[v - major_range_first] = d;
-                       });
-    } else {
-      auto sparse_begin = local_degrees.begin() + partial_offset;
-      auto sparse_end = local_degrees.begin() + partial_offset + edge_partition.major_range_size();
-      thrust::tabulate(handle.get_thrust_policy(),
-                       sparse_begin,
-                       sparse_end,
-                       [offsets = edge_partition.offsets()] __device__(auto i) {
-                         return offsets[i + 1] - offsets[i];
-                       });
-    }
-    partial_offset += edge_partition.major_range_size();
-  }
-  return local_degrees;
-}
-
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees)
-{
-  static_assert(GraphViewType::is_multi_gpu == true);
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using weight_t = typename GraphViewType::weight_type;
-  // auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-
-  auto& comm           = handle.get_comms();
-  auto const comm_size = comm.get_size();
-  auto const comm_rank = comm.get_rank();
-  auto& col_comm       = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-  auto const col_size  = col_comm.get_size();
-  auto& row_comm       = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-  auto const row_rank  = row_comm.get_rank();
-  auto const row_size  = row_comm.get_size();
-
-  rmm::device_uvector<edge_t> global_adjacency_list_offsets(global_degree_offsets.size(),
-                                                            handle.get_stream());
-
-  edge_t edge_count_in_all_previous_partitions{0};
-  vertex_t vertex_offset{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition =
-      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-        graph_view.local_edge_partition_view(i));
-    auto edge_counts =
-      cugraph::host_scalar_allgather(comm, edge_partition.number_of_edges(), handle.get_stream());
-    edge_t partial_edge_count{0};
-    for (int r = 0; r < row_rank; ++r) {
-      for (int c = 0; c < col_size; ++c) {
-        partial_edge_count += edge_counts[r + c * row_size];
-      }
-    }
-    thrust::exclusive_scan(
-      handle.get_thrust_policy(),
-      global_out_degrees.cbegin() + vertex_offset,
-      global_out_degrees.cbegin() + vertex_offset + edge_partition.major_range_size(),
-      global_adjacency_list_offsets.begin() + vertex_offset);
-
-    thrust::transform(
-      handle.get_thrust_policy(),
-      global_adjacency_list_offsets.cbegin() + vertex_offset,
-      global_adjacency_list_offsets.cbegin() + vertex_offset + edge_partition.major_range_size(),
-      global_degree_offsets.cbegin() + vertex_offset,
-      global_adjacency_list_offsets.begin() + vertex_offset,
-      [offset = edge_count_in_all_previous_partitions + partial_edge_count] __device__(
-        auto val0, auto val1) { return val0 + val1 + offset; });
-
-    edge_count_in_all_previous_partitions +=
-      std::accumulate(edge_counts.begin(), edge_counts.end(), edge_t{0});
-    vertex_offset += edge_partition.major_range_size();
-  }
-  return global_adjacency_list_offsets;
-}
-
-template <typename GraphViewType>
-std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-get_global_degree_information(raft::handle_t const& handle, GraphViewType const& graph_view)
-{
-  static_assert(GraphViewType::is_multi_gpu == true);
-  using edge_t       = typename GraphViewType::edge_type;
-  auto local_degrees = compute_local_major_degrees(handle, graph_view);
-
-  auto& col_comm      = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-  auto const col_size = col_comm.get_size();
-  auto const col_rank = col_comm.get_rank();
-  auto& row_comm      = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-  auto const row_size = row_comm.get_size();
-
-  auto& comm           = handle.get_comms();
-  auto const comm_size = comm.get_size();
-  auto const comm_rank = comm.get_rank();
-
-  rmm::device_uvector<edge_t> temp_input(local_degrees.size(), handle.get_stream());
-  raft::update_device(
-    temp_input.data(), local_degrees.data(), local_degrees.size(), handle.get_stream());
-
-  rmm::device_uvector<edge_t> recv_data(local_degrees.size(), handle.get_stream());
-  if (col_rank == 0) {
-    thrust::fill(handle.get_thrust_policy(), recv_data.begin(), recv_data.end(), edge_t{0});
-  }
-  for (int i = 0; i < col_size - 1; ++i) {
-    if (col_rank == i) {
-      comm.device_send(
-        temp_input.begin(), temp_input.size(), comm_rank + row_size, handle.get_stream());
-    }
-    if (col_rank == i + 1) {
-      comm.device_recv(
-        recv_data.begin(), recv_data.size(), comm_rank - row_size, handle.get_stream());
-      thrust::transform(handle.get_thrust_policy(),
-                        temp_input.begin(),
-                        temp_input.end(),
-                        recv_data.begin(),
-                        temp_input.begin(),
-                        thrust::plus<edge_t>());
-    }
-    col_comm.barrier();
-  }
-  // Get global degrees
-  device_bcast(col_comm,
-               temp_input.begin(),
-               temp_input.begin(),
-               temp_input.size(),
-               col_size - 1,
-               handle.get_stream());
-
-  return std::make_tuple(std::move(recv_data), std::move(temp_input));
-}
-
-template <typename GraphViewType, typename VertexIterator, typename GPUIdIterator>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename std::iterator_traits<GPUIdIterator>::value_type>>
-gather_active_majors(raft::handle_t const& handle,
-                     GraphViewType const& graph_view,
-                     VertexIterator vertex_input_first,
-                     VertexIterator vertex_input_last,
-                     GPUIdIterator gpu_id_first)
-{
-  static_assert(GraphViewType::is_multi_gpu == true);
-  static_assert(GraphViewType::is_storage_transposed == false);
-  using gpu_t    = typename std::iterator_traits<GPUIdIterator>::value_type;
-  using vertex_t = typename GraphViewType::vertex_type;
-
-  auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-  size_t source_count  = thrust::distance(vertex_input_first, vertex_input_last);
-  auto external_source_counts =
-    cugraph::host_scalar_allgather(col_comm, source_count, handle.get_stream());
-  auto total_external_source_count =
-    std::accumulate(external_source_counts.begin(), external_source_counts.end(), size_t{0});
-  std::vector<size_t> displacements(external_source_counts.size(), size_t{0});
-  std::exclusive_scan(
-    external_source_counts.begin(), external_source_counts.end(), displacements.begin(), size_t{0});
-
-  rmm::device_uvector<vertex_t> active_majors(total_external_source_count, handle.get_stream());
-  rmm::device_uvector<gpu_t> active_major_gpu_ids(total_external_source_count, handle.get_stream());
-  // Get the sources other gpus on the same row are working on
-  // FIXME : replace with device_bcast for better scaling
-  device_allgatherv(col_comm,
-                    vertex_input_first,
-                    active_majors.data(),
-                    external_source_counts,
-                    displacements,
-                    handle.get_stream());
-  device_allgatherv(col_comm,
-                    gpu_id_first,
-                    active_major_gpu_ids.data(),
-                    external_source_counts,
-                    displacements,
-                    handle.get_stream());
-  thrust::sort_by_key(handle.get_thrust_policy(),
-                      active_majors.begin(),
-                      active_majors.end(),
-                      active_major_gpu_ids.begin());
-  return std::make_tuple(std::move(active_majors), std::move(active_major_gpu_ids));
-}
-
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> get_active_major_global_degrees(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees)
-{
-  using vertex_t    = typename GraphViewType::vertex_type;
-  using edge_t      = typename GraphViewType::edge_type;
-  using partition_t = edge_partition_device_view_t<typename GraphViewType::vertex_type,
-                                                   typename GraphViewType::edge_type,
-                                                   typename GraphViewType::weight_type,
-                                                   GraphViewType::is_multi_gpu>;
-  rmm::device_uvector<edge_t> active_major_degrees(active_majors.size(), handle.get_stream());
-
-  std::vector<vertex_t> id_begin;
-  std::vector<vertex_t> id_end;
-  std::vector<vertex_t> count_offsets;
-  id_begin.reserve(graph_view.number_of_local_edge_partitions());
-  id_end.reserve(graph_view.number_of_local_edge_partitions());
-  count_offsets.reserve(graph_view.number_of_local_edge_partitions());
-  vertex_t counter{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition = partition_t(graph_view.local_edge_partition_view(i));
-    // Starting vertex ids of each partition
-    id_begin.push_back(edge_partition.major_range_first());
-    id_end.push_back(edge_partition.major_range_last());
-    count_offsets.push_back(counter);
-    counter += edge_partition.major_range_size();
-  }
-  rmm::device_uvector<vertex_t> vertex_id_begin(id_begin.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> vertex_id_end(id_end.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> vertex_count_offsets(count_offsets.size(), handle.get_stream());
-  raft::update_device(
-    vertex_id_begin.data(), id_begin.data(), id_begin.size(), handle.get_stream());
-  raft::update_device(vertex_id_end.data(), id_end.data(), id_end.size(), handle.get_stream());
-  raft::update_device(
-    vertex_count_offsets.data(), count_offsets.data(), count_offsets.size(), handle.get_stream());
-
-  thrust::transform(handle.get_thrust_policy(),
-                    active_majors.begin(),
-                    active_majors.end(),
-                    active_major_degrees.begin(),
-                    [id_begin             = vertex_id_begin.data(),
-                     id_end               = vertex_id_end.data(),
-                     global_out_degrees   = global_out_degrees.data(),
-                     vertex_count_offsets = vertex_count_offsets.data(),
-                     count                = vertex_id_end.size()] __device__(auto v) {
-                      // Find which partition id did the vertex belong to
-                      auto partition_id = thrust::distance(
-                        id_end, thrust::upper_bound(thrust::seq, id_end, id_end + count, v));
-                      // starting position of the segment within global_degree_offset
-                      // where the information for partition (partition_id) starts
-                      //  vertex_count_offsets[partition_id]
-                      // The relative location of offset information for vertex id v within
-                      // the segment
-                      //  v - id_end[partition_id]
-                      auto location_in_segment = v - id_begin[partition_id];
-                      // read location of global_degree_offset needs to take into account the
-                      // partition offsets because it is a concatenation of all the offsets
-                      // across all partitions
-                      auto location = location_in_segment + vertex_count_offsets[partition_id];
-                      return global_out_degrees[location];
-                    });
-  return active_major_degrees;
-}
-
-template <typename GraphViewType>
-std::tuple<rmm::device_uvector<edge_partition_device_view_t<typename GraphViewType::vertex_type,
-                                                            typename GraphViewType::edge_type,
-                                                            typename GraphViewType::weight_type,
-                                                            GraphViewType::is_multi_gpu>>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>>
-partition_information(raft::handle_t const& handle, GraphViewType const& graph_view)
-{
-  using vertex_t    = typename GraphViewType::vertex_type;
-  using edge_t      = typename GraphViewType::edge_type;
-  using partition_t = edge_partition_device_view_t<typename GraphViewType::vertex_type,
-                                                   typename GraphViewType::edge_type,
-                                                   typename GraphViewType::weight_type,
-                                                   GraphViewType::is_multi_gpu>;
-
-  std::vector<partition_t> partitions;
-  std::vector<vertex_t> id_begin;
-  std::vector<vertex_t> id_end;
-  std::vector<vertex_t> hypersparse_begin;
-  std::vector<vertex_t> vertex_count_offsets;
-
-  partitions.reserve(graph_view.number_of_local_edge_partitions());
-  id_begin.reserve(graph_view.number_of_local_edge_partitions());
-  id_end.reserve(graph_view.number_of_local_edge_partitions());
-  hypersparse_begin.reserve(graph_view.number_of_local_edge_partitions());
-  vertex_count_offsets.reserve(graph_view.number_of_local_edge_partitions());
-
-  vertex_t counter{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    partitions.emplace_back(graph_view.local_edge_partition_view(i));
-    auto& edge_partition = partitions.back();
-
-    // Starting vertex ids of each partition
-    id_begin.push_back(edge_partition.major_range_first());
-    id_end.push_back(edge_partition.major_range_last());
-
-    auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
-    auto use_dcs         = segment_offsets
-                             ? ((*segment_offsets).size() > (num_sparse_segments_per_vertex_partition + 1))
-                             : false;
-    if (use_dcs) {
-      auto major_hypersparse_first = edge_partition.major_range_first() +
-                                     (*segment_offsets)[num_sparse_segments_per_vertex_partition];
-      hypersparse_begin.push_back(major_hypersparse_first);
-    } else {
-      hypersparse_begin.push_back(edge_partition.major_range_last());
-    }
-
-    // Count of relative position of the vertices
-    vertex_count_offsets.push_back(counter);
-
-    counter += edge_partition.major_range_size();
-  }
-
-  // Allocate device memory for transfer
-  rmm::device_uvector<partition_t> edge_partitions(graph_view.number_of_local_edge_partitions(),
-                                                   handle.get_stream());
-
-  rmm::device_uvector<vertex_t> major_begin(id_begin.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> minor_end(id_end.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> hs_begin(hypersparse_begin.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> vc_offsets(vertex_count_offsets.size(), handle.get_stream());
-
-  // Transfer data
-  raft::update_device(
-    edge_partitions.data(), partitions.data(), partitions.size(), handle.get_stream());
-  raft::update_device(major_begin.data(), id_begin.data(), id_begin.size(), handle.get_stream());
-  raft::update_device(minor_end.data(), id_end.data(), id_end.size(), handle.get_stream());
-  raft::update_device(vc_offsets.data(),
-                      vertex_count_offsets.data(),
-                      vertex_count_offsets.size(),
-                      handle.get_stream());
-  raft::update_device(
-    hs_begin.data(), hypersparse_begin.data(), hypersparse_begin.size(), handle.get_stream());
-
-  return std::make_tuple(std::move(edge_partitions),
-                         std::move(major_begin),
-                         std::move(minor_end),
-                         std::move(hs_begin),
-                         std::move(vc_offsets));
-}
-
-template <typename GraphViewType, typename gpu_t>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<gpu_t>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-gather_local_edges(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
-  rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
-  typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets)
-{
-  static_assert(GraphViewType::is_multi_gpu == true);
-  using vertex_t  = typename GraphViewType::vertex_type;
-  using edge_t    = typename GraphViewType::edge_type;
-  auto edge_count = active_majors.size() * indices_per_major;
-  rmm::device_uvector<vertex_t> majors(edge_count, handle.get_stream());
-  rmm::device_uvector<vertex_t> minors(edge_count, handle.get_stream());
-  rmm::device_uvector<gpu_t> minor_gpu_ids(edge_count, handle.get_stream());
-  vertex_t invalid_vertex_id = graph_view.number_of_vertices();
-
-  auto [partitions, id_begin, id_end, hypersparse_begin, vertex_count_offsets] =
-    partition_information(handle, graph_view);
-
-  thrust::for_each(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator<size_t>(0),
-    thrust::make_counting_iterator<size_t>(edge_count),
-    [edge_index_first      = minor_map.begin(),
-     active_majors         = active_majors.data(),
-     active_major_gpu_ids  = active_major_gpu_ids.data(),
-     id_begin              = id_begin.data(),
-     id_end                = id_end.data(),
-     id_seg_count          = id_begin.size(),
-     vertex_count_offsets  = vertex_count_offsets.data(),
-     glbl_degree_offsets   = global_degree_offsets.data(),
-     glbl_adj_list_offsets = global_adjacency_list_offsets.data(),
-     majors                = majors.data(),
-     minors                = minors.data(),
-     dst_gpu_ids           = minor_gpu_ids.data(),
-     partitions            = partitions.data(),
-     hypersparse_begin     = hypersparse_begin.data(),
-     invalid_vertex_id,
-     indices_per_major] __device__(auto index) {
-      // major which this edge index refers to
-      auto loc           = index / indices_per_major;
-      auto major         = active_majors[loc];
-      majors[index]      = major;
-      dst_gpu_ids[index] = active_major_gpu_ids[loc];
-
-      // Find which partition id did the major belong to
-      auto partition_id = thrust::distance(
-        id_end, thrust::upper_bound(thrust::seq, id_end, id_end + id_seg_count, major));
-      // starting position of the segment within global_degree_offset
-      // where the information for partition (partition_id) starts
-      //  vertex_count_offsets[partition_id]
-      // The relative location of offset information for vertex id v within
-      // the segment
-      //  v - seg[partition_id]
-      vertex_t location_in_segment;
-      if (major < hypersparse_begin[partition_id]) {
-        location_in_segment = major - id_begin[partition_id];
-      } else {
-        auto row_hypersparse_idx =
-          partitions[partition_id].major_hypersparse_idx_from_major_nocheck(major);
-        if (row_hypersparse_idx) {
-          location_in_segment = *(row_hypersparse_idx)-id_begin[partition_id];
-        } else {
-          minors[index] = invalid_vertex_id;
-          return;
-        }
-      }
-
-      // csr offset value for vertex v that belongs to partition (partition_id)
-      auto offset_ptr                = partitions[partition_id].offsets();
-      auto sparse_offset             = offset_ptr[location_in_segment];
-      auto local_out_degree          = offset_ptr[location_in_segment + 1] - sparse_offset;
-      vertex_t const* adjacency_list = partitions[partition_id].indices() + sparse_offset;
-      // read location of global_degree_offset needs to take into account the
-      // partition offsets because it is a concatenation of all the offsets
-      // across all partitions
-      auto location        = location_in_segment + vertex_count_offsets[partition_id];
-      auto g_degree_offset = glbl_degree_offsets[location];
-      auto g_dst_index     = edge_index_first[index];
-      if ((g_dst_index >= g_degree_offset) && (g_dst_index < g_degree_offset + local_out_degree)) {
-        minors[index]           = adjacency_list[g_dst_index - g_degree_offset];
-        edge_index_first[index] = g_dst_index - g_degree_offset + glbl_adj_list_offsets[location];
-      } else {
-        minors[index] = invalid_vertex_id;
-      }
-    });
-  auto input_iter = thrust::make_zip_iterator(
-    thrust::make_tuple(majors.begin(), minors.begin(), minor_gpu_ids.begin(), minor_map.begin()));
-
-  auto compacted_length = thrust::distance(
-    input_iter,
-    thrust::remove_if(
-      handle.get_thrust_policy(),
-      input_iter,
-      input_iter + minors.size(),
-      minors.begin(),
-      [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); }));
-  majors.resize(compacted_length, handle.get_stream());
-  minors.resize(compacted_length, handle.get_stream());
-  minor_gpu_ids.resize(compacted_length, handle.get_stream());
-  minor_map.resize(compacted_length, handle.get_stream());
-  return std::make_tuple(
-    std::move(majors), std::move(minors), std::move(minor_gpu_ids), std::move(minor_map));
-}
-
-template <typename GraphViewType, typename VertexIterator>
-typename GraphViewType::edge_type edgelist_count(raft::handle_t const& handle,
-                                                 GraphViewType const& graph_view,
-                                                 VertexIterator vertex_input_first,
-                                                 VertexIterator vertex_input_last)
-{
-  using edge_t = typename GraphViewType::edge_type;
-  // Expect that vertex input list is sorted
-  auto [partitions, id_begin, id_end, hypersparse_begin, vertex_count_offsets] =
-    partition_information(handle, graph_view);
-  return thrust::transform_reduce(
-    handle.get_thrust_policy(),
-    vertex_input_first,
-    vertex_input_last,
-    [partitions           = partitions.data(),
-     id_begin             = id_begin.data(),
-     id_end               = id_end.data(),
-     id_seg_count         = id_begin.size(),
-     hypersparse_begin    = hypersparse_begin.data(),
-     vertex_count_offsets = vertex_count_offsets.data()] __device__(auto major) {
-      // Find which partition id did the vertex belong to
-      auto partition_id = thrust::distance(
-        id_end, thrust::upper_bound(thrust::seq, id_end, id_end + id_seg_count, major));
-      auto edge_partition          = partitions[partition_id];
-      auto major_hypersparse_first = hypersparse_begin[partition_id];
-      if (major < major_hypersparse_first) {
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        return edge_partition.local_degree(major_offset);
-      } else {
-        auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
-        return major_hypersparse_idx
-                 ? edge_partition.local_degree(
-                     edge_partition.major_offset_from_major_nocheck(major_hypersparse_first) +
-                     *major_hypersparse_idx)
-                 : edge_t{0};
-      }
-    },
-    edge_t{0},
-    thrust::plus<edge_t>());
-}
-
-template <typename vertex_t>
-std::vector<vertex_t> get_active_major_segments(raft::handle_t const& handle,
-                                                vertex_t major_range_first,
-                                                vertex_t major_range_last,
-                                                std::vector<vertex_t> const& partition_segments,
-                                                const rmm::device_uvector<vertex_t>& active_majors)
-{
-  std::vector<vertex_t> segments(partition_segments.size());
-  std::transform(partition_segments.begin(),
-                 partition_segments.end(),
-                 segments.begin(),
-                 [major_range_first](auto s) { return s + major_range_first; });
-  segments.push_back(major_range_last);
-
-  rmm::device_uvector<vertex_t> p_segments(segments.size(), handle.get_stream());
-  raft::update_device(p_segments.data(), segments.data(), segments.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> majors_segments(segments.size(), handle.get_stream());
-  thrust::lower_bound(handle.get_thrust_policy(),
-                      active_majors.cbegin(),
-                      active_majors.cend(),
-                      p_segments.begin(),
-                      p_segments.end(),
-                      majors_segments.begin());
-  std::vector<vertex_t> active_majors_segments(majors_segments.size());
-  raft::update_host(active_majors_segments.data(),
-                    majors_segments.data(),
-                    majors_segments.size(),
-                    handle.get_stream());
-  return active_majors_segments;
-}
-
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-void local_major_degree(
-  raft::handle_t const& handle,
-  edge_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> partition,
-  rmm::device_uvector<vertex_t> const& active_majors,
-  std::vector<vertex_t> const& majors_segments,
-  std::vector<vertex_t> const& partition_segments,
-  edge_t* out_degrees)
-{
-  auto active_major_count = majors_segments.back() - majors_segments.front();
-  // Sparse region
-  if (majors_segments[3] - majors_segments[0] > 0) {
-    thrust::transform(handle.get_thrust_policy(),
-                      active_majors.cbegin() + majors_segments[0],
-                      active_majors.cbegin() + majors_segments[3],
-                      out_degrees,
-                      [partition] __device__(auto major) {
-                        auto major_offset = partition.major_offset_from_major_nocheck(major);
-                        return partition.local_degree(major_offset);
-                      });
-  }
-  // Hypersparse region
-  if (majors_segments[4] - majors_segments[3] > 0) {
-    auto major_hypersparse_first =
-      partition.major_range_first() +
-      partition_segments[detail::num_sparse_segments_per_vertex_partition];
-    auto major_offset =
-      static_cast<size_t>(major_hypersparse_first - partition.major_range_first());
-    thrust::transform(handle.get_thrust_policy(),
-                      active_majors.cbegin() + majors_segments[3],
-                      active_majors.cbegin() + majors_segments[4],
-                      out_degrees + majors_segments[3] - majors_segments[0],
-                      [partition, major_offset] __device__(auto major) {
-                        auto major_idx = partition.major_hypersparse_idx_from_major_nocheck(major);
-                        if (major_idx) {
-                          return partition.local_degree(major_offset + *major_idx);
-                        } else {
-                          return edge_t{0};
-                        }
-                      });
-  }
-}
-
-template <typename GraphViewType, typename prop_t>
-std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<typename GraphViewType::vertex_type>,
-           rmm::device_uvector<prop_t>,
-           rmm::device_uvector<typename GraphViewType::edge_type>>
-gather_one_hop_edgelist(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
-  const rmm::device_uvector<prop_t>& active_major_property,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets)
-{
-  // Assumes active_majors is sorted
-
-  static_assert(GraphViewType::is_multi_gpu == true);
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using weight_t = typename GraphViewType::weight_type;
-
-  std::vector<std::vector<vertex_t>> active_majors_segments;
-  vertex_t max_active_major_count{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto partition =
-      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-        graph_view.local_edge_partition_view(i));
-    // Identify segments of active_majors
-    active_majors_segments.emplace_back(
-      get_active_major_segments(handle,
-                                partition.major_range_first(),
-                                partition.major_range_last(),
-                                *(graph_view.local_edge_partition_segment_offsets(i)),
-                                active_majors));
-    auto& majors_segments = active_majors_segments.back();
-    // Count of active majors belonging to this partition
-    max_active_major_count =
-      std::max(max_active_major_count, majors_segments.back() - majors_segments.front());
-  }
-
-  auto& comm           = handle.get_comms();
-  auto const comm_rank = comm.get_rank();
-  rmm::device_uvector<edge_t> active_majors_out_offsets(1 + max_active_major_count,
-                                                        handle.get_stream());
-  auto edge_count = edgelist_count(handle, graph_view, active_majors.begin(), active_majors.end());
-  rmm::device_uvector<vertex_t> majors(edge_count, handle.get_stream());
-  rmm::device_uvector<vertex_t> minors(edge_count, handle.get_stream());
-  rmm::device_uvector<prop_t> minor_prop_ids(edge_count, handle.get_stream());
-  rmm::device_uvector<edge_t> minor_map(edge_count, handle.get_stream());
-
-  edge_t output_offset = 0;
-  vertex_t vertex_offset{0};
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto partition =
-      edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-        graph_view.local_edge_partition_view(i));
-    auto& majors_segments = active_majors_segments[i];
-    // Calculate local degree offsets
-    auto active_major_count = majors_segments.back() - majors_segments.front();
-    active_majors_out_offsets.set_element_to_zero_async(0, handle.get_stream());
-    local_major_degree(handle,
-                       partition,
-                       active_majors,
-                       majors_segments,
-                       *(graph_view.local_edge_partition_segment_offsets(i)),
-                       1 + active_majors_out_offsets.data());
-    thrust::inclusive_scan(handle.get_thrust_policy(),
-                           active_majors_out_offsets.begin() + 1,
-                           active_majors_out_offsets.begin() + 1 + active_major_count,
-                           active_majors_out_offsets.begin() + 1);
-    active_majors_out_offsets.resize(1 + active_major_count, handle.get_stream());
-    partially_decompress_edge_partition_to_fill_edgelist<vertex_t,
-                                                         edge_t,
-                                                         weight_t,
-                                                         prop_t,
-                                                         GraphViewType::is_multi_gpu>(
-      handle,
-      partition,
-      active_majors.cbegin(),
-      active_majors_out_offsets.cbegin(),
-      majors_segments,
-      output_offset + majors.data(),
-      output_offset + minors.data(),
-      thrust::nullopt,
-      thrust::make_optional(
-        thrust::make_tuple(active_major_property.cbegin(), output_offset + minor_prop_ids.data())),
-      thrust::make_optional(
-        thrust::make_tuple(global_adjacency_list_offsets.cbegin() + vertex_offset,
-                           output_offset + minor_map.begin())));
-    output_offset += active_majors_out_offsets.back_element(handle.get_stream());
-    vertex_offset += partition.major_range_size();
-  }
-
-  return std::make_tuple(
-    std::move(majors), std::move(minors), std::move(minor_prop_ids), std::move(minor_map));
-}
-
-}  // namespace original
-}  // namespace detail
-}  // namespace cugraph
diff --git a/cpp/src/sampling/detail/graph_functions.hpp b/cpp/src/sampling/detail/graph_functions.hpp
index f0b1580b88e..8eef9c83d61 100644
--- a/cpp/src/sampling/detail/graph_functions.hpp
+++ b/cpp/src/sampling/detail/graph_functions.hpp
@@ -150,8 +150,7 @@ gather_local_edges(
   const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
   rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
   typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets);
+  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets);
 
 /**
  * @brief Gather edge list for specified vertices
diff --git a/cpp/src/sampling/detail/sampling_utils_impl.cuh b/cpp/src/sampling/detail/sampling_utils_impl.cuh
index 478f75095c1..793df64a8d6 100644
--- a/cpp/src/sampling/detail/sampling_utils_impl.cuh
+++ b/cpp/src/sampling/detail/sampling_utils_impl.cuh
@@ -135,73 +135,6 @@ rmm::device_uvector<typename GraphViewType::edge_type> compute_local_major_degre
   return local_degrees;
 }
 
-template <typename GraphViewType>
-rmm::device_uvector<typename GraphViewType::edge_type> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  GraphViewType const& graph_view,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees)
-{
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using weight_t = typename GraphViewType::weight_type;
-
-  rmm::device_uvector<edge_t> global_adjacency_list_offsets(global_degree_offsets.size(),
-                                                            handle.get_stream());
-
-  if constexpr (GraphViewType::is_multi_gpu) {
-    auto& comm           = handle.get_comms();
-    auto const comm_size = comm.get_size();
-    auto const comm_rank = comm.get_rank();
-    auto& col_comm       = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-    auto const col_size  = col_comm.get_size();
-    auto& row_comm       = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-    auto const row_rank  = row_comm.get_rank();
-    auto const row_size  = row_comm.get_size();
-
-    edge_t edge_count_in_all_previous_partitions{0};
-    vertex_t vertex_offset{0};
-    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-      auto edge_partition =
-        edge_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-          graph_view.local_edge_partition_view(i));
-      auto edge_counts =
-        cugraph::host_scalar_allgather(comm, edge_partition.number_of_edges(), handle.get_stream());
-      edge_t partial_edge_count{0};
-      for (int r = 0; r < row_rank; ++r) {
-        for (int c = 0; c < col_size; ++c) {
-          partial_edge_count += edge_counts[r + c * row_size];
-        }
-      }
-      thrust::exclusive_scan(
-        handle.get_thrust_policy(),
-        global_out_degrees.cbegin() + vertex_offset,
-        global_out_degrees.cbegin() + vertex_offset + edge_partition.major_range_size(),
-        global_adjacency_list_offsets.begin() + vertex_offset);
-
-      thrust::transform(
-        handle.get_thrust_policy(),
-        global_adjacency_list_offsets.cbegin() + vertex_offset,
-        global_adjacency_list_offsets.cbegin() + vertex_offset + edge_partition.major_range_size(),
-        global_degree_offsets.cbegin() + vertex_offset,
-        global_adjacency_list_offsets.begin() + vertex_offset,
-        [offset = edge_count_in_all_previous_partitions + partial_edge_count] __device__(
-          auto val0, auto val1) { return val0 + val1 + offset; });
-
-      edge_count_in_all_previous_partitions +=
-        std::accumulate(edge_counts.begin(), edge_counts.end(), edge_t{0});
-      vertex_offset += edge_partition.major_range_size();
-    }
-  } else {
-    thrust::fill(handle.get_thrust_policy(),
-                 global_adjacency_list_offsets.begin(),
-                 global_adjacency_list_offsets.end(),
-                 edge_t{0});
-  }
-
-  return global_adjacency_list_offsets;
-}
-
 template <typename GraphViewType>
 std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
            rmm::device_uvector<typename GraphViewType::edge_type>>
@@ -268,8 +201,10 @@ rmm::device_uvector<vertex_t> allgather_active_majors(raft::handle_t const& hand
 {
   auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
   size_t source_count  = d_in.size();
+
   auto external_source_counts =
     cugraph::host_scalar_allgather(col_comm, source_count, handle.get_stream());
+
   auto total_external_source_count =
     std::accumulate(external_source_counts.begin(), external_source_counts.end(), size_t{0});
   std::vector<size_t> displacements(external_source_counts.size(), size_t{0});
@@ -453,8 +388,7 @@ gather_local_edges(
   const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
   rmm::device_uvector<typename GraphViewType::edge_type>&& minor_map,
   typename GraphViewType::edge_type indices_per_major,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets,
-  const rmm::device_uvector<typename GraphViewType::edge_type>& global_adjacency_list_offsets)
+  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets)
 {
   using vertex_t  = typename GraphViewType::vertex_type;
   using edge_t    = typename GraphViewType::edge_type;
@@ -479,19 +413,18 @@ gather_local_edges(
       handle.get_thrust_policy(),
       thrust::make_counting_iterator<size_t>(0),
       thrust::make_counting_iterator<size_t>(edge_count),
-      [edge_index_first      = minor_map.begin(),
-       active_majors         = active_majors.data(),
-       id_begin              = id_begin.data(),
-       id_end                = id_end.data(),
-       id_seg_count          = id_begin.size(),
-       vertex_count_offsets  = vertex_count_offsets.data(),
-       glbl_degree_offsets   = global_degree_offsets.data(),
-       glbl_adj_list_offsets = global_adjacency_list_offsets.data(),
-       majors                = majors.data(),
-       minors                = minors.data(),
-       weights               = weights ? weights->data() : nullptr,
-       partitions            = partitions.data(),
-       hypersparse_begin     = hypersparse_begin.data(),
+      [edge_index_first     = minor_map.begin(),
+       active_majors        = active_majors.data(),
+       id_begin             = id_begin.data(),
+       id_end               = id_end.data(),
+       id_seg_count         = id_begin.size(),
+       vertex_count_offsets = vertex_count_offsets.data(),
+       glbl_degree_offsets  = global_degree_offsets.data(),
+       majors               = majors.data(),
+       minors               = minors.data(),
+       weights              = weights ? weights->data() : nullptr,
+       partitions           = partitions.data(),
+       hypersparse_begin    = hypersparse_begin.data(),
        invalid_vertex_id,
        indices_per_major] __device__(auto index) {
         // major which this edge index refers to
@@ -510,7 +443,6 @@ gather_local_edges(
         if (major < hypersparse_begin[partition_id]) {
           location_in_segment = major - id_begin[partition_id];
           local_out_degree = offset_ptr[location_in_segment + 1] - offset_ptr[location_in_segment];
-          ;
         } else {
           auto row_hypersparse_idx =
             partitions[partition_id].major_hypersparse_idx_from_major_nocheck(major);
@@ -520,7 +452,6 @@ gather_local_edges(
               (hypersparse_begin[partition_id] - id_begin[partition_id]) + *row_hypersparse_idx;
             local_out_degree =
               offset_ptr[location_in_segment + 1] - offset_ptr[location_in_segment];
-            ;
           }
         }
 
@@ -855,7 +786,10 @@ gather_one_hop_edgelist(
         output_offset + minors.data(),
         weights ? thrust::make_optional(output_offset + weights->data()) : thrust::nullopt,
         thrust::nullopt,
-        thrust::nullopt);
+        thrust::nullopt,
+        // FIXME: When PR 2365 is merged, this parameter can be removed
+        graph_view.local_edge_partition_segment_offsets(i));
+
       output_offset += active_majors_out_offsets.back_element(handle.get_stream());
       vertex_offset += partition.major_range_size();
     }
@@ -911,7 +845,9 @@ gather_one_hop_edgelist(
       minors.data(),
       weights ? thrust::make_optional(weights->data()) : thrust::nullopt,
       thrust::nullopt,
-      thrust::nullopt);
+      thrust::nullopt,
+      // FIXME: When PR 2365 is merged, this parameter can be removed
+      std::nullopt);
   }
 
   return std::make_tuple(std::move(majors), std::move(minors), std::move(weights));
diff --git a/cpp/src/sampling/detail/sampling_utils_mg.cu b/cpp/src/sampling/detail/sampling_utils_mg.cu
index ffcead02cf9..726309e5370 100644
--- a/cpp/src/sampling/detail/sampling_utils_mg.cu
+++ b/cpp/src/sampling/detail/sampling_utils_mg.cu
@@ -46,42 +46,6 @@ get_global_degree_information(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, double, false, true> const& graph_view);
 
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, float, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, true> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
 template rmm::device_uvector<int32_t> allgather_active_majors(raft::handle_t const& handle,
                                                               rmm::device_uvector<int32_t>&& d_in);
 
@@ -186,8 +150,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -197,8 +160,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -208,8 +170,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -219,8 +180,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -230,8 +190,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -241,8 +200,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
diff --git a/cpp/src/sampling/detail/sampling_utils_sg.cu b/cpp/src/sampling/detail/sampling_utils_sg.cu
index 52f2f9245b9..ae2980e5f10 100644
--- a/cpp/src/sampling/detail/sampling_utils_sg.cu
+++ b/cpp/src/sampling/detail/sampling_utils_sg.cu
@@ -49,42 +49,6 @@ get_global_degree_information(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, double, false, false> const& graph_view);
 
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, float, false, false> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int32_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, double, false, false> const& graph_view,
-  rmm::device_uvector<int32_t> const& global_degree_offsets,
-  rmm::device_uvector<int32_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, float, false, false> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, double, false, false> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, float, false, false> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
-template rmm::device_uvector<int64_t> get_global_adjacency_offset(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, double, false, false> const& graph_view,
-  rmm::device_uvector<int64_t> const& global_degree_offsets,
-  rmm::device_uvector<int64_t> const& global_out_degrees);
-
 template rmm::device_uvector<int32_t> get_active_major_global_degrees(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, float, false, false> const& graph_view,
@@ -129,8 +93,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -140,8 +103,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -151,8 +113,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -162,8 +123,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int32_t>&& minor_map,
                    int32_t indices_per_major,
-                   const rmm::device_uvector<int32_t>& global_degree_offsets,
-                   const rmm::device_uvector<int32_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int32_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -173,8 +133,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int32_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -184,8 +143,7 @@ gather_local_edges(raft::handle_t const& handle,
                    const rmm::device_uvector<int64_t>& active_majors,
                    rmm::device_uvector<int64_t>&& minor_map,
                    int64_t indices_per_major,
-                   const rmm::device_uvector<int64_t>& global_degree_offsets,
-                   const rmm::device_uvector<int64_t>& global_adjacency_list_offsets);
+                   const rmm::device_uvector<int64_t>& global_degree_offsets);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
diff --git a/cpp/src/sampling/nbr_sampling_impl.cuh b/cpp/src/sampling/nbr_sampling_impl.cuh
deleted file mode 100644
index 7b0402057ad..00000000000
--- a/cpp/src/sampling/nbr_sampling_impl.cuh
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Andrei Schaffer, aschaffer@nvidia.com
-//
-#pragma once
-
-#include <cugraph/detail/graph_functions.cuh>
-#include <cugraph/graph.hpp>
-#include <utilities/graph_utils.cuh>
-
-#include <cugraph/utilities/shuffle_comm.cuh>
-
-#include <raft/device_atomics.cuh>
-#include <raft/handle.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/adjacent_difference.h>
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sort.h>
-#include <thrust/tabulate.h>
-#include <thrust/transform.h>
-#include <thrust/tuple.h>
-
-#include "rw_traversals.hpp"
-
-#include <cugraph-ops/graph/sampling.hpp>
-
-#include <algorithm>
-#include <limits>
-#include <numeric>
-#include <type_traits>
-#include <vector>
-
-namespace cugraph {
-namespace detail {
-namespace original {
-
-/**
- * @brief Projects zip input onto the lower dim zip output, where lower dimension components are
- * specified by tuple indices; e.g., extracts the (destination_vertex_id, rank_to_send_it_to)
- * components from the quadruplet (vertex_t source_vertex, vertex_t destination_vertex, int rank,
- * edge_t index) via indices {1,2};
- * @tparam vertex_index non-type template parameter specifying index in the input tuple where vertex
- * IDs are stored;
- * @tparam rank_index non-type template parameter specifying index in the input tuple where rank IDs
- * are stored;
- * @tparam zip_in_it_t zip Type for the input tuple;
- * @tparam zip_out_it_t zip Type for the output tuple;
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param[in] begin zip begin iterator of quadruplets from which new input is extracted; typically
- * (vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index)
- * @param[in] end zip end iterator of quadruplets from which new input is extracted;
- * @param[out] result begin of result zip iterator of pairs for next iteration; typically
- * (vertex_t source_vertex, int rank)
- */
-template <size_t vertex_index, size_t rank_index, typename zip_in_it_t, typename zip_out_it_t>
-void project(raft::handle_t const& handle, zip_in_it_t begin, zip_in_it_t end, zip_out_it_t result)
-{
-  thrust::transform(handle.get_thrust_policy(), begin, end, result, [] __device__(auto const& tpl) {
-    return thrust::make_tuple(thrust::get<vertex_index>(tpl), thrust::get<rank_index>(tpl));
-  });
-}
-
-/**
- * @brief Shuffles zipped pairs of vertex IDs and ranks IDs to the GPU's that the vertex IDs belong
- * to. The assumption is that the return provides a per-GPU coalesced set of pairs, with
- * corresponding counts vector. To limit the result to the self-GPU one needs additional filtering
- * to extract the corresponding set from the coalesced set of sets and using the corresponding
- * counts entry.
- * @tparam graph_view_t Type of graph view.
- * @tparam zip_iterator_t zip Type for the zipped tuple<vertex_t, gpu_t> (vertexID, rank);
- * @tparam gpu_t Type used for storing GPU rank IDs;
- * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
- * and handles to various CUDA libraries) to run graph algorithms.
- * @param[in] graph_view Graph View object to generate NBR Sampling on.
- * @param[in] begin zip begin iterator of (vertexID, rank) pairs.
- * @param[in] end zip end iterator of (vertexID, rank) pairs.
- * @param[in] unnamed tag used for template tag dispatching
- * @return tuple pair of coalesced pairs and counts
- */
-template <typename graph_view_t, typename zip_iterator_t, typename gpu_t>
-std::tuple<std::tuple<device_vec_t<typename graph_view_t::vertex_type>, device_vec_t<gpu_t>>,
-           std::vector<size_t>>
-shuffle_to_gpus(raft::handle_t const& handle,
-                graph_view_t const& graph_view,
-                zip_iterator_t begin,
-                zip_iterator_t end,
-                gpu_t)
-{
-  using vertex_t = typename graph_view_t::vertex_type;
-  using edge_t   = typename graph_view_t::edge_type;
-
-  auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
-  device_vec_t<vertex_t> d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(),
-                                                        handle.get_stream());
-  raft::update_device(d_vertex_partition_range_lasts.data(),
-                      vertex_partition_range_lasts.data(),
-                      vertex_partition_range_lasts.size(),
-                      handle.get_stream());
-
-  return groupby_gpu_id_and_shuffle_values(
-    handle.get_comms(),
-    begin,
-    end,
-    [vertex_partition_range_lasts = d_vertex_partition_range_lasts.data(),
-     num_vertex_partitions = d_vertex_partition_range_lasts.size()] __device__(auto tpl_v_r) {
-      return static_cast<gpu_t>(
-        thrust::distance(vertex_partition_range_lasts,
-                         thrust::lower_bound(thrust::seq,
-                                             vertex_partition_range_lasts,
-                                             vertex_partition_range_lasts + num_vertex_partitions,
-                                             thrust::get<0>(tpl_v_r))));
-    },
-    handle.get_stream());
-}
-
-/**
- * @brief Updates pair of vertex IDs and ranks IDs to the GPU's that the vertex IDs belong
- * to.
- * @tparam graph_view_t Type of graph view.
- * @tparam zip_iterator_t zip Type for the zipped tuple<vertex_t, gpu_t> (vertexID, rank).
- * @tparam gpu_t Type used for storing GPU rank IDs;
- * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
- * and handles to various CUDA libraries) to run graph algorithms.
- * @param[in] graph_view Graph View object to generate NBR Sampling on.
- * @param[in] begin zip begin iterator of (vertexID, rank) pairs.
- * @param[in] end zip end iterator of (vertexID, rank) pairs.
- * @param[in] rank for which data is to be extracted.
- * @param[out] d_in vertex set to be updated.
- * @param[out] d_ranks corresponding rank set to be updated.
- * @param[in] unnamed tag used for template tag dispatching.
- */
-template <typename graph_view_t, typename zip_iterator_t, typename gpu_t>
-void update_input_by_rank(raft::handle_t const& handle,
-                          graph_view_t const& graph_view,
-                          zip_iterator_t begin,
-                          zip_iterator_t end,
-                          size_t rank,
-                          device_vec_t<typename graph_view_t::vertex_type>& d_in,
-                          device_vec_t<gpu_t>& d_ranks,
-                          gpu_t)
-{
-  auto&& [rx_tpl_v_r, rx_counts] =
-    detail::original::shuffle_to_gpus(handle, graph_view, begin, end, gpu_t{});
-
-  // filter rx_tpl_v_r and rx_counts vector by rank:
-  //
-  decltype(rx_counts) rx_offsets(rx_counts.size());
-  std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_offsets.begin(), 0);
-
-  // resize d_in, d_ranks:
-  //
-  auto new_in_sz = rx_counts.at(rank);
-  d_in.resize(new_in_sz, handle.get_stream());
-  d_ranks.resize(new_in_sz, handle.get_stream());
-
-  // project output onto input:
-  // zip d_in, d_ranks
-  //
-  auto new_in_zip = thrust::make_zip_iterator(
-    thrust::make_tuple(d_in.begin(), d_ranks.begin()));  // result start_zip
-
-  auto&& d_new_dests = std::get<0>(rx_tpl_v_r);
-  auto&& d_new_ranks = std::get<1>(rx_tpl_v_r);
-  auto offset        = rx_offsets.at(rank);
-
-  auto tpl_in_it_begin = thrust::make_zip_iterator(
-    thrust::make_tuple(d_new_dests.begin() + offset, d_new_ranks.begin() + offset));
-  project<0, 1>(handle, tpl_in_it_begin, tpl_in_it_begin + new_in_sz, new_in_zip);
-}
-
-/**
- * @brief Shuffles zipped tuples of (vertex_t source_vertex, vertex_t destination_vertex, int rank,
- * index_t index) to specified target GPU's.
- * @tparam vertex_t Type of vertex IDs.
- * @tparam gpu_t Type used for storing GPU rank IDs.
- * @tparam index_t Type used for indexing; typically edge_t.
- * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
- * and handles to various CUDA libraries) to run graph algorithms.
- * @param[in] d_src source vertex IDs; shuffle prims require it be mutable.
- * @param[in] d_dst destination vertex IDs; must be mutable.
- * @param[in] d_gpu_id_keys target GPU IDs (ranks); must be mutable.
- * @param[in] d_indices indices of destination vertices; must be mutable.
- * @return tuple of tuple of device vectors and counts:
- * ((vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index), rx_counts)
- */
-template <typename vertex_t, typename gpu_t, typename index_t>
-std::tuple<std::tuple<device_vec_t<vertex_t>,
-                      device_vec_t<vertex_t>,
-                      device_vec_t<gpu_t>,
-                      device_vec_t<index_t>>,
-           std::vector<size_t>>
-shuffle_to_target_gpu_ids(raft::handle_t const& handle,
-                          device_vec_t<vertex_t>& d_src,
-                          device_vec_t<vertex_t>& d_dst,
-                          device_vec_t<gpu_t>& d_gpu_id_keys,
-                          device_vec_t<index_t>& d_indices)
-{
-  auto zip_it_begin =
-    thrust::make_zip_iterator(thrust::make_tuple(d_src.begin(), d_dst.begin(), d_indices.begin()));
-
-  thrust::sort_by_key(
-    handle.get_thrust_policy(), d_gpu_id_keys.begin(), d_gpu_id_keys.end(), zip_it_begin);
-
-  rmm::device_uvector<size_t> tx_counts(handle.get_comms().get_size(), handle.get_stream());
-
-  thrust::tabulate(
-    handle.get_thrust_policy(),
-    tx_counts.begin(),
-    tx_counts.end(),
-    [gpu_id_first = d_gpu_id_keys.begin(), gpu_id_last = d_gpu_id_keys.end()] __device__(size_t i) {
-      return static_cast<size_t>(thrust::distance(
-        gpu_id_first,
-        thrust::upper_bound(thrust::seq, gpu_id_first, gpu_id_last, static_cast<gpu_t>(i))));
-    });
-
-  thrust::adjacent_difference(
-    handle.get_thrust_policy(), tx_counts.begin(), tx_counts.end(), tx_counts.begin());
-
-  std::vector<size_t> h_tx_counts(tx_counts.size());
-  raft::update_host(h_tx_counts.data(), tx_counts.data(), tx_counts.size(), handle.get_stream());
-
-  handle.sync_stream();
-
-  return  // [rx_tuple, rx_counts]
-    shuffle_values(handle.get_comms(),
-                   thrust::make_zip_iterator(thrust::make_tuple(
-                     d_src.begin(), d_dst.begin(), d_gpu_id_keys.begin(), d_indices.begin())),
-                   h_tx_counts,
-                   handle.get_stream());
-}
-
-/**
- * @brief Multi-GPU Uniform Neighborhood Sampling. The outline of the algorithm:
- *
- * uniform_nbr_sample(J[p][], L, K[], flag_unique) {
- *   Out[p][] = {};                                              // initialize output result
- * (empty)
- *
- *  loop level in {0,…, L-1} {                                   // 1 tree level / iteration
- *       n_per_level = |J| * L^ (level+1);                       // size of output per level
- *
- *       J[] = union(J[], {J[partition_row],
- *                        for partition_row same as `p`};
- *
- *      for each pair (s, _) in J[] {                            // cache out-degrees of src_v
- * set; d_out_deg[s] = mnmg_get_out_deg(graph, s);
- *      }
- *
- *      d_indices[] = segmented_random_generator(d_out_degs[],   // sizes[] to define range to
- *                                                               // sample from;
- *                                               K[level],       // fanout per-level
- *                                               flag_unique);
- *                                                               // for each (s, _) in J[]{
- *                                                               //   generate {0,…,out-deg(s)};}
- *
- *     d_out[] = gather_nbr(J[], d_indices[], level, K[level]);  // {(s, d, r),…} MNMG prim that
- *                                                               // gathers the NBR for current
- *                                                               // level of each src_v;
- *                                                               // output is set of triplets
- *                                                               // (src_v, dst_v,
- * rank_to_send_to) Out[p][] = union(Out[p][], d_out[]);                      // append local
- * output to result d_out[] = shuffle(d_out[]);                               // reshuffle output
- * to
- *                                                               // corresponding rank
- *     J[] = project(d_out[], []((s,d,r)){ return (d,r);});      // extract the (d, r) from (s,d,
- * r)
- *                                                               // for next iter
- *    }
- *    return Out[p][];
- * }
- *
- * @tparam graph_view_t Type of graph view.
- * @tparam gpu_t Type used for storing GPU rank IDs;
- * @tparam index_t Type used for indexing; typically edge_t.
- * @tparam seeder_t Type for generating random engine seeds.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Graph View object to generate NBR Sampling on.
- * @param d_in Device vector of starting vertex IDs for the NBR Sampling. Must be non-const for
- * shuffling.
- * @param d_ranks Device vector of ranks for which corresponding vertex ID data must be sent to. The
- * pairs (vertex_ID, rank) must be shuffled together. Must be non-const for shuffling.
- * @param h_fan_out vector of branching out (fan-out) degree per source vertex for each level
- * @param global_degree_offsets local partition of global out-degree cache; pass-through
- * parameter used for obtaining local out-degree information
- * @param flag_replacement boolean flag specifying if random sampling is done without replacement
- * (true); or, with replacement (false); default = true;
- * @return tuple of device vectors:
- * (vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index)
- */
-template <typename graph_view_t,
-          typename gpu_t,
-          typename index_t  = typename graph_view_t::edge_type,
-          typename seeder_t = cugraph::detail::original::clock_seeding_t<uint64_t>>
-std::tuple<device_vec_t<typename graph_view_t::vertex_type>,
-           device_vec_t<typename graph_view_t::vertex_type>,
-           device_vec_t<gpu_t>,
-           device_vec_t<index_t>>
-uniform_nbr_sample_impl(
-  raft::handle_t const& handle,
-  graph_view_t const& graph_view,
-  device_vec_t<typename graph_view_t::vertex_type>& d_in,
-  device_vec_t<gpu_t>& d_ranks,
-  std::vector<int> const& h_fan_out,
-  device_vec_t<typename graph_view_t::edge_type> const& global_out_degrees,
-  device_vec_t<typename graph_view_t::edge_type> const& global_degree_offsets,
-  device_vec_t<typename graph_view_t::edge_type> const& global_adjacency_list_offsets,
-  bool flag_replacement)
-{
-  using vertex_t        = typename graph_view_t::vertex_type;
-  using edge_t          = typename graph_view_t::edge_type;
-  using return_t        = std::tuple<device_vec_t<vertex_t>,
-                              device_vec_t<vertex_t>,
-                              device_vec_t<gpu_t>,
-                              device_vec_t<index_t>>;
-  namespace cugraph_ops = cugraph::ops::gnn::graph;
-
-  if constexpr (graph_view_t::is_multi_gpu) {
-    size_t num_starting_vs = d_in.size();
-
-    CUGRAPH_EXPECTS(num_starting_vs == d_ranks.size(),
-                    "Sets of input vertices and ranks must have same sizes.");
-
-    auto num_levels = h_fan_out.size();
-
-    CUGRAPH_EXPECTS(num_levels > 0, "Invalid input argument: number of levels must be non-zero.");
-
-    // Output quad of accumulators to collect results into:
-    // (all start as empty)
-    //
-    device_vec_t<vertex_t> d_acc_src(0, handle.get_stream());
-    device_vec_t<vertex_t> d_acc_dst(0, handle.get_stream());
-    device_vec_t<gpu_t> d_acc_ranks(0, handle.get_stream());
-    device_vec_t<index_t> d_acc_indices(0, handle.get_stream());
-
-    auto&& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-    auto&& row_rank = row_comm.get_rank();
-
-    auto&& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-    auto&& col_rank = col_comm.get_rank();
-
-    auto const self_rank = handle.get_comms().get_rank();
-
-    size_t level{0l};
-    for (auto&& k_level : h_fan_out) {
-      // prep step for extracting out-degs(sources):
-      //
-      auto&& [d_new_in, d_new_rank] =
-        gather_active_majors(handle, graph_view, d_in.cbegin(), d_in.cend(), d_ranks.cbegin());
-
-      rmm::device_uvector<vertex_t> d_out_src(0, handle.get_stream());
-      rmm::device_uvector<vertex_t> d_out_dst(0, handle.get_stream());
-      rmm::device_uvector<gpu_t> d_out_ranks(0, handle.get_stream());
-      rmm::device_uvector<edge_t> d_indices(0, handle.get_stream());
-
-      if (k_level != 0) {
-        // extract out-degs(sources):
-        //
-        auto&& d_out_degs =
-          get_active_major_global_degrees(handle, graph_view, d_new_in, global_out_degrees);
-
-        // segemented-random-generation of indices:
-        //
-        device_vec_t<edge_t> d_rnd_indices(d_new_in.size() * k_level, handle.get_stream());
-
-        raft::random::RngState rng_state(row_rank + level);
-        cugraph_ops::get_sampling_index(detail::original::raw_ptr(d_rnd_indices),
-                                        rng_state,
-                                        detail::original::raw_const_ptr(d_out_degs),
-                                        static_cast<edge_t>(d_out_degs.size()),
-                                        static_cast<int32_t>(k_level),
-                                        flag_replacement,
-                                        handle.get_stream());
-
-        // gather edges step:
-        // invalid entries (not found, etc.) filtered out in result;
-        // d_indices[] filtered out in-place (to avoid copies+moves);
-        //
-        auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] =
-          gather_local_edges(handle,
-                             graph_view,
-                             d_new_in,
-                             d_new_rank,
-                             std::move(d_rnd_indices),
-                             static_cast<edge_t>(k_level),
-                             global_degree_offsets,
-                             global_adjacency_list_offsets);
-        d_out_src   = std::move(temp_d_out_src);
-        d_out_dst   = std::move(temp_d_out_dst);
-        d_out_ranks = std::move(temp_d_out_ranks);
-        d_indices   = std::move(temp_d_indices);
-      } else {
-        auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] =
-          gather_one_hop_edgelist(
-            handle, graph_view, d_new_in, d_new_rank, global_adjacency_list_offsets);
-        d_out_src   = std::move(temp_d_out_src);
-        d_out_dst   = std::move(temp_d_out_dst);
-        d_out_ranks = std::move(temp_d_out_ranks);
-        d_indices   = std::move(temp_d_indices);
-      }
-
-      // resize accumulators:
-      //
-      auto old_sz = d_acc_dst.size();
-      auto add_sz = d_out_dst.size();
-      auto new_sz = old_sz + add_sz;
-
-      d_acc_src.resize(new_sz, handle.get_stream());
-      d_acc_dst.resize(new_sz, handle.get_stream());
-      d_acc_ranks.resize(new_sz, handle.get_stream());
-      d_acc_indices.resize(new_sz, handle.get_stream());
-
-      // zip quad; must be done after resizing,
-      // because they grow from one iteration to another,
-      // so iterators could be invalidated:
-      //
-      auto acc_zip_it =
-        thrust::make_zip_iterator(thrust::make_tuple(d_acc_src.begin() + old_sz,
-                                                     d_acc_dst.begin() + old_sz,
-                                                     d_acc_ranks.begin() + old_sz,
-                                                     d_acc_indices.begin() + old_sz));
-
-      // union step:
-      //
-      auto out_zip_it = thrust::make_zip_iterator(thrust::make_tuple(
-        d_out_src.begin(), d_out_dst.begin(), d_out_ranks.begin(), d_indices.begin()));
-
-      thrust::copy_n(handle.get_thrust_policy(), out_zip_it, add_sz, acc_zip_it);
-
-      // shuffle step: update input for self_rank
-      // zipping is necessary to preserve rank info during shuffle!
-      //
-      auto next_in_zip_begin =
-        thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.begin(), d_out_ranks.begin()));
-      auto next_in_zip_end =
-        thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.end(), d_out_ranks.end()));
-
-      update_input_by_rank(handle,
-                           graph_view,
-                           next_in_zip_begin,
-                           next_in_zip_end,
-                           static_cast<size_t>(self_rank),
-                           d_in,
-                           d_ranks,
-                           gpu_t{});
-
-      ++level;
-    }
-
-    return std::make_tuple(
-      std::move(d_acc_src), std::move(d_acc_dst), std::move(d_acc_ranks), std::move(d_acc_indices));
-  } else {
-    CUGRAPH_FAIL("Neighborhood sampling functionality is supported only for the multi-gpu case.");
-  }
-}
-
-}  // namespace original
-}  // namespace detail
-
-template <typename graph_view_t, typename gpu_t, typename index_t>
-std::tuple<std::tuple<rmm::device_uvector<typename graph_view_t::vertex_type>,
-                      rmm::device_uvector<typename graph_view_t::vertex_type>,
-                      rmm::device_uvector<gpu_t>,
-                      rmm::device_uvector<index_t>>,
-           std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t const& graph_view,
-                   typename graph_view_t::vertex_type const* ptr_d_start,
-                   gpu_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement)
-{
-  using vertex_t = typename graph_view_t::vertex_type;
-  using edge_t   = typename graph_view_t::edge_type;
-
-  size_t const self_rank = handle.get_comms().get_rank();
-
-  // shuffle input data to its corresponding rank;
-  // (Note: shuffle prims require mutable iterators)
-  //
-  detail::original::device_vec_t<vertex_t> d_start_vs(num_starting_vs, handle.get_stream());
-  detail::original::device_vec_t<gpu_t> d_ranks(num_starting_vs, handle.get_stream());
-  // ...hence copy required:
-  //
-  thrust::copy_n(handle.get_thrust_policy(), ptr_d_start, num_starting_vs, d_start_vs.begin());
-  thrust::copy_n(handle.get_thrust_policy(), ptr_d_ranks, num_starting_vs, d_ranks.begin());
-
-  // shuffle data to local rank:
-  //
-  auto next_in_zip_begin =
-    thrust::make_zip_iterator(thrust::make_tuple(d_start_vs.begin(), d_ranks.begin()));
-
-  auto next_in_zip_end =
-    thrust::make_zip_iterator(thrust::make_tuple(d_start_vs.end(), d_ranks.end()));
-
-  detail::original::update_input_by_rank(handle,
-                                         graph_view,
-                                         next_in_zip_begin,
-                                         next_in_zip_end,
-                                         self_rank,
-                                         d_start_vs,
-                                         d_ranks,
-                                         gpu_t{});
-
-  // preamble step for out-degree info:
-  //
-  auto&& [global_degree_offsets, global_out_degrees] =
-    detail::original::get_global_degree_information(handle, graph_view);
-  auto&& global_adjacency_list_offsets = detail::original::get_global_adjacency_offset(
-    handle, graph_view, global_degree_offsets, global_out_degrees);
-
-  // extract output quad SOA:
-  //
-  auto&& [d_src, d_dst, d_gpus, d_indices] =
-    detail::original::uniform_nbr_sample_impl(handle,
-                                              graph_view,
-                                              d_start_vs,
-                                              d_ranks,
-                                              h_fan_out,
-                                              global_out_degrees,
-                                              global_degree_offsets,
-                                              global_adjacency_list_offsets,
-                                              flag_replacement);
-
-  // shuffle quad SOA by d_gpus:
-  //
-  return detail::original::shuffle_to_target_gpu_ids(handle, d_src, d_dst, d_gpus, d_indices);
-}
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/nbr_sampling_mg.cu b/cpp/src/sampling/nbr_sampling_mg.cu
deleted file mode 100644
index efb79d3995a..00000000000
--- a/cpp/src/sampling/nbr_sampling_mg.cu
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cugraph/algorithms.hpp>
-
-#include "nbr_sampling_impl.cuh"
-
-namespace cugraph {
-// template explicit instantiation directives (EIDir's):
-//
-// SG FP32{
-template std::tuple<std::tuple<rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int32_t, float, false, true> const& gview,
-                   int32_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-
-template std::tuple<std::tuple<rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int64_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int64_t, float, false, true> const& gview,
-                   int32_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-
-template std::tuple<std::tuple<rmm::device_uvector<int64_t>,
-                               rmm::device_uvector<int64_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int64_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int64_t, int64_t, float, false, true> const& gview,
-                   int64_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-//}
-//
-// SG FP64{
-template std::tuple<std::tuple<rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int32_t, double, false, true> const& gview,
-                   int32_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-
-template std::tuple<std::tuple<rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int64_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int32_t, int64_t, double, false, true> const& gview,
-                   int32_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-
-template std::tuple<std::tuple<rmm::device_uvector<int64_t>,
-                               rmm::device_uvector<int64_t>,
-                               rmm::device_uvector<int32_t>,
-                               rmm::device_uvector<int64_t>>,
-                    std::vector<size_t>>
-uniform_nbr_sample(raft::handle_t const& handle,
-                   graph_view_t<int64_t, int64_t, double, false, true> const& gview,
-                   int64_t const* ptr_d_start,
-                   int32_t const* ptr_d_ranks,
-                   size_t num_starting_vs,
-                   std::vector<int> const& h_fan_out,
-                   bool flag_replacement);
-//}
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
index 2748d75d2fc..9fe22d8a8d5 100644
--- a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
+++ b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
@@ -52,7 +52,6 @@ uniform_nbr_sample_impl(
   raft::host_span<const int> h_fan_out,
   rmm::device_uvector<typename graph_view_t::edge_type> const& global_out_degrees,
   rmm::device_uvector<typename graph_view_t::edge_type> const& global_degree_offsets,
-  rmm::device_uvector<typename graph_view_t::edge_type> const& global_adjacency_list_offsets,
   bool with_replacement,
   uint64_t seed)
 {
@@ -123,8 +122,7 @@ uniform_nbr_sample_impl(
                            d_in,
                            std::move(d_rnd_indices),
                            static_cast<edge_t>(k_level),
-                           global_degree_offsets,
-                           global_adjacency_list_offsets);
+                           global_degree_offsets);
     } else {
       std::tie(d_out_src, d_out_dst, d_out_indices) =
         gather_one_hop_edgelist(handle, graph_view, d_in);
@@ -183,8 +181,6 @@ uniform_nbr_sample(
   //
   auto&& [global_degree_offsets, global_out_degrees] =
     detail::get_global_degree_information(handle, graph_view);
-  auto&& global_adjacency_list_offsets = detail::get_global_adjacency_offset(
-    handle, graph_view, global_degree_offsets, global_out_degrees);
 
   return detail::uniform_nbr_sample_impl(handle,
                                          graph_view,
@@ -192,7 +188,6 @@ uniform_nbr_sample(
                                          fan_out,
                                          global_out_degrees,
                                          global_degree_offsets,
-                                         global_adjacency_list_offsets,
                                          with_replacement,
                                          seed);
 }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 26fc60e2cd6..a9841403723 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -610,10 +610,6 @@ if(BUILD_CUGRAPH_MG_TESTS)
         # - MG GATHER_ONE_HOP tests ---------------------------------------------------------------
         ConfigureTestMG(MG_GATHER_ONE_HOP_TEST sampling/detail/mg_gather_one_hop.cu)
 
-        ###########################################################################################
-        # - MG NBR SAMPLING tests -----------------------------------------------------------------
-        ConfigureTestMG(MG_NBR_SAMPLING_TEST sampling/detail/mg_nbr_sampling.cu)
-
         ###########################################################################################
         # - MG NBR SAMPLING tests -----------------------------------------------------------------
         ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cu)
diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
index 8fbd80a90c0..46a54b74cc3 100644
--- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
+++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
@@ -25,18 +25,18 @@ typedef int32_t vertex_t;
 typedef int32_t edge_t;
 typedef float weight_t;
 
-int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle,
-                                                      vertex_t* h_src,
-                                                      vertex_t* h_dst,
-                                                      edge_t* h_idx,
-                                                      size_t num_vertices,
-                                                      size_t num_edges,
-                                                      vertex_t* h_start,
-                                                      size_t num_starts,
-                                                      int* fan_out,
-                                                      size_t max_depth,
-                                                      bool_t with_replacement,
-                                                      bool_t store_transposed)
+int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle,
+                                         vertex_t* h_src,
+                                         vertex_t* h_dst,
+                                         edge_t* h_idx,
+                                         size_t num_vertices,
+                                         size_t num_edges,
+                                         vertex_t* h_start,
+                                         size_t num_starts,
+                                         int* fan_out,
+                                         size_t max_depth,
+                                         bool_t with_replacement,
+                                         bool_t store_transposed)
 {
   int test_ret_value = 0;
 
@@ -66,7 +66,7 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han
 
   h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32);
 
-  ret_code = cugraph_experimental_uniform_neighbor_sample(
+  ret_code = cugraph_uniform_neighbor_sample(
     handle, graph, d_start_view, h_fan_out_view, with_replacement, FALSE, &result, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -123,140 +123,6 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han
   return test_ret_value;
 }
 
-int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle,
-                                         vertex_t* h_src,
-                                         vertex_t* h_dst,
-                                         weight_t* h_wgt,
-                                         size_t num_vertices,
-                                         size_t num_edges,
-                                         vertex_t* h_start,
-                                         int* h_start_label,
-                                         size_t num_starts,
-                                         int* fan_out,
-                                         size_t max_depth,
-                                         bool_t with_replacement,
-                                         bool_t store_transposed)
-{
-  int test_ret_value = 0;
-
-  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
-  cugraph_error_t* ret_error    = NULL;
-
-  cugraph_graph_t* graph          = NULL;
-  cugraph_sample_result_t* result = NULL;
-
-  cugraph_type_erased_device_array_t* d_start                 = NULL;
-  cugraph_type_erased_device_array_view_t* d_start_view       = NULL;
-  cugraph_type_erased_device_array_t* d_start_label           = NULL;
-  cugraph_type_erased_device_array_view_t* d_start_label_view = NULL;
-  cugraph_type_erased_host_array_view_t* h_fan_out_view       = NULL;
-
-  ret_code = create_mg_test_graph(
-    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &graph, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed.");
-
-  d_start_view = cugraph_type_erased_device_array_view(d_start);
-
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_start_view, (byte_t*)h_start, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed.");
-
-  d_start_label_view = cugraph_type_erased_device_array_view(d_start_label);
-
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_start_label_view, (byte_t*)h_start_label, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
-
-  h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32);
-
-  ret_code = cugraph_uniform_neighbor_sample(handle,
-                                             graph,
-                                             d_start_view,
-                                             d_start_label_view,
-                                             h_fan_out_view,
-                                             with_replacement,
-                                             FALSE,
-                                             &result,
-                                             &ret_error);
-
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed.");
-
-  cugraph_type_erased_device_array_view_t* srcs;
-  cugraph_type_erased_device_array_view_t* dsts;
-  cugraph_type_erased_device_array_view_t* labels;
-  cugraph_type_erased_device_array_view_t* index;
-  cugraph_type_erased_host_array_view_t* counts;
-
-  srcs   = cugraph_sample_result_get_sources(result);
-  dsts   = cugraph_sample_result_get_destinations(result);
-  labels = cugraph_sample_result_get_start_labels(result);
-  index  = cugraph_sample_result_get_index(result);
-  counts = cugraph_sample_result_get_counts(result);
-
-  size_t result_size = cugraph_type_erased_device_array_view_size(srcs);
-
-  vertex_t h_srcs[result_size];
-  vertex_t h_dsts[result_size];
-  int h_labels[result_size];
-  edge_t h_index[result_size];
-  size_t* h_counts;
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_labels, labels, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts);
-
-  //  NOTE:  The C++ tester does a more thorough validation.  For our purposes
-  //  here we will do a simpler validation, merely checking that all edges
-  //  are actually part of the graph
-  weight_t M[num_vertices][num_vertices];
-
-  for (int i = 0; i < num_vertices; ++i)
-    for (int j = 0; j < num_vertices; ++j)
-      M[i][j] = 0.0;
-
-  for (int i = 0; i < num_edges; ++i)
-    M[h_src[i]][h_dst[i]] = h_wgt[i];
-
-  for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) {
-    TEST_ASSERT(test_ret_value,
-                M[h_srcs[i]][h_dsts[i]] > 0.0,
-                "uniform_neighbor_sample got edge that doesn't exist");
-
-    bool_t found = FALSE;
-    for (int j = 0; j < num_starts; ++j)
-      found = found || (h_labels[i] == h_start_label[j]);
-
-    TEST_ASSERT(test_ret_value, found, "invalid label");
-  }
-
-  cugraph_type_erased_host_array_view_free(h_fan_out_view);
-
-  return test_ret_value;
-}
-
 int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle)
 {
   size_t num_edges    = 8;
@@ -264,21 +130,19 @@ int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle)
   size_t fan_out_size = 2;
   size_t num_starts   = 2;
 
-  vertex_t src[]          = {0, 1, 1, 2, 2, 2, 3, 4};
-  vertex_t dst[]          = {1, 3, 4, 0, 1, 3, 5, 5};
-  weight_t wgt[]          = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
-  vertex_t start[]        = {2, 2};
-  vertex_t start_labels[] = {0, 1};
-  int fan_out[]           = {1, 2};
+  vertex_t src[]   = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t dst[]   = {1, 3, 4, 0, 1, 3, 5, 5};
+  edge_t idx[]     = {0, 1, 2, 3, 4, 5, 6, 7};
+  vertex_t start[] = {2, 2};
+  int fan_out[]    = {1, 2};
 
   return generic_uniform_neighbor_sample_test(handle,
                                               src,
                                               dst,
-                                              wgt,
+                                              idx,
                                               num_vertices,
                                               num_edges,
                                               start,
-                                              start_labels,
                                               num_starts,
                                               fan_out,
                                               fan_out_size,
@@ -286,33 +150,6 @@ int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle)
                                               FALSE);
 }
 
-int test_experimental_uniform_neighbor_sample(const cugraph_resource_handle_t* handle)
-{
-  size_t num_edges    = 8;
-  size_t num_vertices = 6;
-  size_t fan_out_size = 2;
-  size_t num_starts   = 2;
-
-  vertex_t src[]   = {0, 1, 1, 2, 2, 2, 3, 4};
-  vertex_t dst[]   = {1, 3, 4, 0, 1, 3, 5, 5};
-  edge_t idx[]     = {0, 1, 2, 3, 4, 5, 6, 7};
-  vertex_t start[] = {2, 2};
-  int fan_out[]    = {1, 2};
-
-  return generic_experimental_uniform_neighbor_sample_test(handle,
-                                                           src,
-                                                           dst,
-                                                           idx,
-                                                           num_vertices,
-                                                           num_edges,
-                                                           start,
-                                                           num_starts,
-                                                           fan_out,
-                                                           fan_out_size,
-                                                           TRUE,
-                                                           FALSE);
-}
-
 /******************************************************************************/
 
 int main(int argc, char** argv)
@@ -340,7 +177,6 @@ int main(int argc, char** argv)
 
   if (result == 0) {
     result |= RUN_MG_TEST(test_uniform_neighbor_sample, handle);
-    result |= RUN_MG_TEST(test_experimental_uniform_neighbor_sample, handle);
 
     cugraph_free_resource_handle(handle);
   }
diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c
index 428ccbec7a9..180ab96566a 100644
--- a/cpp/tests/c_api/uniform_neighbor_sample_test.c
+++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c
@@ -110,18 +110,18 @@ int create_test_graph_with_edge_ids(const cugraph_resource_handle_t* p_handle,
   return test_ret_value;
 }
 
-int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src,
-                                                      vertex_t* h_dst,
-                                                      edge_t* h_ids,
-                                                      size_t num_vertices,
-                                                      size_t num_edges,
-                                                      vertex_t* h_start,
-                                                      size_t num_starts,
-                                                      int* fan_out,
-                                                      size_t max_depth,
-                                                      bool_t with_replacement,
-                                                      bool_t renumber,
-                                                      bool_t store_transposed)
+int generic_uniform_neighbor_sample_test(vertex_t* h_src,
+                                         vertex_t* h_dst,
+                                         edge_t* h_ids,
+                                         size_t num_vertices,
+                                         size_t num_edges,
+                                         vertex_t* h_start,
+                                         size_t num_starts,
+                                         int* fan_out,
+                                         size_t max_depth,
+                                         bool_t with_replacement,
+                                         bool_t renumber,
+                                         bool_t store_transposed)
 {
   int test_ret_value = 0;
 
@@ -155,7 +155,7 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src,
 
   h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32);
 
-  ret_code = cugraph_experimental_uniform_neighbor_sample(
+  ret_code = cugraph_uniform_neighbor_sample(
     handle, graph, d_start_view, h_fan_out_view, with_replacement, FALSE, &result, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -210,151 +210,6 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src,
   return test_ret_value;
 }
 
-int generic_uniform_neighbor_sample_test(vertex_t* h_src,
-                                         vertex_t* h_dst,
-                                         weight_t* h_wgt,
-                                         size_t num_vertices,
-                                         size_t num_edges,
-                                         vertex_t* h_start,
-                                         int* h_start_label,
-                                         size_t num_starts,
-                                         int* fan_out,
-                                         size_t max_depth,
-                                         bool_t with_replacement,
-                                         bool_t renumber,
-                                         bool_t store_transposed)
-{
-  int test_ret_value = 0;
-
-  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
-  cugraph_error_t* ret_error    = NULL;
-
-  cugraph_resource_handle_t* handle = NULL;
-  cugraph_graph_t* graph            = NULL;
-  cugraph_sample_result_t* result   = NULL;
-
-  cugraph_type_erased_device_array_t* d_start                 = NULL;
-  cugraph_type_erased_device_array_view_t* d_start_view       = NULL;
-  cugraph_type_erased_device_array_t* d_start_label           = NULL;
-  cugraph_type_erased_device_array_view_t* d_start_label_view = NULL;
-  cugraph_type_erased_host_array_view_t* h_fan_out_view       = NULL;
-
-  handle = cugraph_create_resource_handle(NULL);
-  TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
-
-  ret_code = create_test_graph(
-    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, renumber, FALSE, &graph, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed.");
-
-  d_start_view = cugraph_type_erased_device_array_view(d_start);
-
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_start_view, (byte_t*)h_start, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed.");
-
-  d_start_label_view = cugraph_type_erased_device_array_view(d_start_label);
-
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_start_label_view, (byte_t*)h_start_label, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start_label copy_from_host failed.");
-
-  h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32);
-
-  ret_code = cugraph_uniform_neighbor_sample(handle,
-                                             graph,
-                                             d_start_view,
-                                             d_start_label_view,
-                                             h_fan_out_view,
-                                             with_replacement,
-                                             FALSE,
-                                             &result,
-                                             &ret_error);
-
-  TEST_ASSERT(test_ret_value,
-              ret_code != CUGRAPH_SUCCESS,
-              "cugraph_uniform_neighbor_sample expected to fail in SG test");
-
-#if 0
-  // FIXME:  cugraph_uniform_neighbor_sample does not support SG
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed.");
-
-  cugraph_type_erased_device_array_view_t* srcs;
-  cugraph_type_erased_device_array_view_t* dsts;
-  cugraph_type_erased_device_array_view_t* labels;
-  cugraph_type_erased_device_array_view_t* index;
-  cugraph_type_erased_host_array_view_t* counts;
-
-  srcs   = cugraph_sample_result_get_sources(result);
-  dsts   = cugraph_sample_result_get_destinations(result);
-  labels = cugraph_sample_result_get_start_labels(result);
-  index  = cugraph_sample_result_get_index(result);
-  counts = cugraph_sample_result_get_counts(result);
-
-  size_t result_size = cugraph_type_erased_device_array_view_size(srcs);
-
-  vertex_t h_srcs[result_size];
-  vertex_t h_dsts[result_size];
-  int h_labels[result_size];
-  edge_t h_index[result_size];
-  size_t* h_counts;
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_labels, labels, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  ret_code =
-    cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
-
-  h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts);
-
-  //  NOTE:  The C++ tester does a more thorough validation.  For our purposes
-  //  here we will do a simpler validation, merely checking that all edges
-  //  are actually part of the graph
-  weight_t M[num_vertices][num_vertices];
-
-  for (int i = 0; i < num_vertices; ++i)
-    for (int j = 0; j < num_vertices; ++j)
-      M[i][j] = 0.0;
-
-  for (int i = 0; i < num_edges; ++i)
-    M[h_src[i]][h_dst[i]] = h_wgt[i];
-
-  for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) {
-    TEST_ASSERT(test_ret_value,
-                M[h_srcs[i]][h_dsts[i]] > 0.0,
-                "uniform_neighbor_sample got edge that doesn't exist");
-
-    bool_t found = FALSE;
-    for (int j = 0; j < num_starts; ++j)
-      found = found || (h_labels[i] == h_start_label[j]);
-
-    TEST_ASSERT(test_ret_value, found, "invalid label");
-  }
-
-  cugraph_type_erased_host_array_view_free(h_fan_out_view);
-#endif
-
-  return test_ret_value;
-}
-
 int test_uniform_neighbor_sample()
 {
   size_t num_edges    = 8;
@@ -362,20 +217,18 @@ int test_uniform_neighbor_sample()
   size_t fan_out_size = 2;
   size_t num_starts   = 2;
 
-  vertex_t src[]          = {0, 1, 1, 2, 2, 2, 3, 4};
-  vertex_t dst[]          = {1, 3, 4, 0, 1, 3, 5, 5};
-  weight_t wgt[]          = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
-  vertex_t start[]        = {2, 2};
-  vertex_t start_labels[] = {0, 1};
-  int fan_out[]           = {1, 2};
+  vertex_t src[]    = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t dst[]    = {1, 3, 4, 0, 1, 3, 5, 5};
+  edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  vertex_t start[]  = {2, 2};
+  int fan_out[]     = {1, 2};
 
   return generic_uniform_neighbor_sample_test(src,
                                               dst,
-                                              wgt,
+                                              edge_ids,
                                               num_vertices,
                                               num_edges,
                                               start,
-                                              start_labels,
                                               num_starts,
                                               fan_out,
                                               fan_out_size,
@@ -384,37 +237,9 @@ int test_uniform_neighbor_sample()
                                               FALSE);
 }
 
-int test_experimental_uniform_neighbor_sample()
-{
-  size_t num_edges    = 8;
-  size_t num_vertices = 6;
-  size_t fan_out_size = 2;
-  size_t num_starts   = 2;
-
-  vertex_t src[]    = {0, 1, 1, 2, 2, 2, 3, 4};
-  vertex_t dst[]    = {1, 3, 4, 0, 1, 3, 5, 5};
-  edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7};
-  vertex_t start[]  = {2, 2};
-  int fan_out[]     = {1, 2};
-
-  return generic_experimental_uniform_neighbor_sample_test(src,
-                                                           dst,
-                                                           edge_ids,
-                                                           num_vertices,
-                                                           num_edges,
-                                                           start,
-                                                           num_starts,
-                                                           fan_out,
-                                                           fan_out_size,
-                                                           TRUE,
-                                                           FALSE,
-                                                           FALSE);
-}
-
 int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_uniform_neighbor_sample);
-  result |= RUN_TEST(test_experimental_uniform_neighbor_sample);
   return result;
 }
diff --git a/cpp/tests/sampling/detail/mg_gather_one_hop.cu b/cpp/tests/sampling/detail/mg_gather_one_hop.cu
index 7e7dc50bb09..11e3df78f8e 100644
--- a/cpp/tests/sampling/detail/mg_gather_one_hop.cu
+++ b/cpp/tests/sampling/detail/mg_gather_one_hop.cu
@@ -15,6 +15,9 @@
  */
 
 #include "nbr_sampling_utils.cuh"
+
+#include <sampling/detail/graph_functions.hpp>
+
 #include <raft/comms/comms.hpp>
 #include <raft/comms/mpi_comms.hpp>
 
@@ -92,49 +95,43 @@ class Tests_MG_GatherEdges
     // Generate random vertex ids in the range of current gpu
 
     auto [global_degree_offsets, global_out_degrees] =
-      cugraph::detail::original::get_global_degree_information(handle, mg_graph_view);
-    auto global_adjacency_list_offsets = cugraph::detail::original::get_global_adjacency_offset(
-      handle, mg_graph_view, global_degree_offsets, global_out_degrees);
+      cugraph::detail::get_global_degree_information(handle, mg_graph_view);
 
     // Generate random sources to gather on
     auto random_sources =
       cugraph::test::random_vertex_ids(handle,
                                        mg_graph_view.local_vertex_partition_range_first(),
                                        mg_graph_view.local_vertex_partition_range_last(),
-                                       source_sample_count,
+                                       std::min(mg_graph_view.local_vertex_partition_range_size() *
+                                                  (repetitions_per_vertex + vertex_t{1}),
+                                                source_sample_count),
                                        repetitions_per_vertex);
-    rmm::device_uvector<int> random_source_gpu_ids(random_sources.size(), handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(),
-                 random_source_gpu_ids.begin(),
-                 random_source_gpu_ids.end(),
-                 comm_rank);
-
-    auto [active_sources, active_source_gpu_ids] =
-      cugraph::detail::original::gather_active_majors(handle,
-                                                      mg_graph_view,
-                                                      random_sources.cbegin(),
-                                                      random_sources.cend(),
-                                                      random_source_gpu_ids.cbegin());
-
-    auto [src, dst, gpu_ids, edge_ids] = cugraph::detail::original::gather_one_hop_edgelist(
-      handle, mg_graph_view, active_sources, active_source_gpu_ids, global_adjacency_list_offsets);
+
+    // FIXME: allgather is probably a poor name for this function.
+    //        It's really an allgather across the row communicator
+    auto active_sources =
+      cugraph::detail::allgather_active_majors(handle, std::move(random_sources));
+
+    auto [src, dst, edge_ids] =
+      cugraph::detail::gather_one_hop_edgelist(handle, mg_graph_view, active_sources);
 
     if (prims_usecase.check_correctness) {
-      // Gather outputs
-      auto mg_out_srcs = cugraph::test::device_gatherv(handle, src.data(), src.size());
-      auto mg_out_dsts = cugraph::test::device_gatherv(handle, dst.data(), dst.size());
-      auto mg_out_prop = cugraph::test::device_gatherv(handle, gpu_ids.data(), gpu_ids.size());
-
-      auto mg_out_edge_ids =
-        cugraph::test::device_gatherv(handle, edge_ids.data(), edge_ids.size());
-
-      // Gather inputs
-      auto& col_comm         = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-      auto const col_rank    = col_comm.get_rank();
-      auto sg_active_sources = cugraph::test::device_gatherv(
-        handle, active_sources.data(), col_rank == 0 ? active_sources.size() : 0);
-      auto sg_active_sources_gpu_ids = cugraph::test::device_gatherv(
-        handle, active_source_gpu_ids.data(), col_rank == 0 ? active_source_gpu_ids.size() : 0);
+      // Gather outputs to gpu 0
+      auto mg_out_srcs = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{src.data(), src.size()});
+      auto mg_out_dsts = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{dst.data(), dst.size()});
+
+      // Gather relevant edges from graph
+      auto& col_comm      = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+      auto const col_rank = col_comm.get_rank();
+      auto all_active_sources = cugraph::test::device_allgatherv(
+        handle,
+        raft::device_span<vertex_t const>{active_sources.data(),
+                                          col_rank == 0 ? active_sources.size() : 0});
+
+      thrust::sort(
+        handle.get_thrust_policy(), all_active_sources.begin(), all_active_sources.end());
 
       // Gather input graph edgelist
       rmm::device_uvector<vertex_t> sg_src(0, handle.get_stream());
@@ -142,42 +139,57 @@ class Tests_MG_GatherEdges
       std::tie(sg_src, sg_dst, std::ignore) =
         mg_graph_view.decompress_to_edgelist(handle, std::nullopt);
 
-      auto aggregated_sg_src = cugraph::test::device_gatherv(handle, sg_src.begin(), sg_src.size());
-      auto aggregated_sg_dst = cugraph::test::device_gatherv(handle, sg_dst.begin(), sg_dst.size());
-
-      sort_coo(handle, mg_out_srcs, mg_out_prop, mg_out_dsts);
-
-      if (handle.get_comms().get_rank() == int{0}) {
-        cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(handle);
-        auto aggregated_edge_iter = thrust::make_zip_iterator(
-          thrust::make_tuple(aggregated_sg_src.begin(), aggregated_sg_dst.begin()));
-        thrust::sort(handle.get_thrust_policy(),
-                     aggregated_edge_iter,
-                     aggregated_edge_iter + aggregated_sg_src.size());
-        auto sg_graph_properties =
-          cugraph::graph_properties_t{mg_graph_view.is_symmetric(), mg_graph_view.is_multigraph()};
-
-        std::tie(sg_graph, std::ignore) =
-          cugraph::create_graph_from_edgelist<vertex_t, edge_t, weight_t, false, false>(
-            handle,
-            std::nullopt,
-            std::move(aggregated_sg_src),
-            std::move(aggregated_sg_dst),
-            std::nullopt,
-            sg_graph_properties,
-            false);
-        auto sg_graph_view = sg_graph.view();
-        // Call single gpu gather
-        auto [sg_out_srcs, sg_out_dsts, sg_out_prop] =
-          sg_gather_edges(handle, sg_graph_view, sg_active_sources, sg_active_sources_gpu_ids);
-        sort_coo(handle, sg_out_srcs, sg_out_prop, sg_out_dsts);
-
-        auto passed = thrust::equal(
-          handle.get_thrust_policy(), sg_out_srcs.begin(), sg_out_srcs.end(), mg_out_srcs.begin());
-        passed &= thrust::equal(
-          handle.get_thrust_policy(), sg_out_dsts.begin(), sg_out_dsts.end(), mg_out_dsts.begin());
-        ASSERT_TRUE(passed);
-      }
+      auto begin_iter = thrust::make_zip_iterator(sg_src.begin(), sg_dst.begin());
+      auto new_end    = thrust::remove_if(
+        handle.get_thrust_policy(),
+        begin_iter,
+        begin_iter + sg_src.size(),
+        [sources = all_active_sources.data(), size = all_active_sources.size()] __device__(auto t) {
+          auto src = thrust::get<0>(t);
+          return !thrust::binary_search(thrust::seq, sources, sources + size, src);
+        });
+
+      sg_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream());
+      sg_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream());
+
+      auto aggregated_sg_src = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{sg_src.begin(), sg_src.size()});
+      auto aggregated_sg_dst = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{sg_dst.begin(), sg_dst.size()});
+
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()),
+                   thrust::make_zip_iterator(mg_out_srcs.end(), mg_out_dsts.end()));
+
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(aggregated_sg_src.begin(), aggregated_sg_dst.begin()),
+                   thrust::make_zip_iterator(aggregated_sg_src.end(), aggregated_sg_dst.end()));
+
+      // FIXME:  This is ignoring the case of the same seed being specified multiple
+      //         times.  Not sure that's worth worrying about, so taking the easy way out here.
+      auto unique_end =
+        thrust::unique(handle.get_thrust_policy(),
+                       thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()),
+                       thrust::make_zip_iterator(mg_out_srcs.end(), mg_out_dsts.end()));
+
+      mg_out_srcs.resize(
+        thrust::distance(thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()),
+                         unique_end),
+        handle.get_stream());
+      mg_out_dsts.resize(
+        thrust::distance(thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()),
+                         unique_end),
+        handle.get_stream());
+
+      auto passed = thrust::equal(handle.get_thrust_policy(),
+                                  mg_out_srcs.begin(),
+                                  mg_out_srcs.end(),
+                                  aggregated_sg_src.begin());
+      passed &= thrust::equal(handle.get_thrust_policy(),
+                              mg_out_dsts.begin(),
+                              mg_out_dsts.end(),
+                              aggregated_sg_dst.begin());
+      ASSERT_TRUE(passed);
     }
   }
 };
diff --git a/cpp/tests/sampling/detail/mg_gather_utils.cu b/cpp/tests/sampling/detail/mg_gather_utils.cu
index 6ea0e40c60a..dc0a2fb2cc1 100644
--- a/cpp/tests/sampling/detail/mg_gather_utils.cu
+++ b/cpp/tests/sampling/detail/mg_gather_utils.cu
@@ -15,9 +15,14 @@
  */
 
 #include "nbr_sampling_utils.cuh"
+
+#include <sampling/detail/graph_functions.hpp>
+
 #include <raft/comms/comms.hpp>
 #include <raft/comms/mpi_comms.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+
 #include <gtest/gtest.h>
 
 #include <thrust/equal.h>
@@ -30,6 +35,102 @@ struct Prims_Usecase {
   bool check_correctness{true};
 };
 
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<std::vector<vertex_t>, std::vector<vertex_t>> test_gather_local_edges(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, weight_t, false, true> const& mg_graph_view,
+  rmm::device_uvector<vertex_t> const& sources,
+  rmm::device_uvector<edge_t> const& destination_offsets,
+  edge_t indices_per_source)
+{
+  auto& col_comm      = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+  auto const col_rank = col_comm.get_rank();
+
+  // logic relies on gather_one_hop not having duplicates
+  rmm::device_uvector<vertex_t> sources_copy(sources.size(), handle.get_stream());
+  raft::copy(sources_copy.data(), sources.data(), sources.size(), handle.get_stream());
+  thrust::sort(handle.get_thrust_policy(), sources_copy.begin(), sources_copy.end());
+  auto sources_copy_end =
+    thrust::unique(handle.get_thrust_policy(), sources_copy.begin(), sources_copy.end());
+  sources_copy.resize(thrust::distance(sources_copy.begin(), sources_copy_end),
+                      handle.get_stream());
+
+  auto [one_hop_src, one_hop_dst, one_hop_edge_ids] =
+    cugraph::detail::gather_one_hop_edgelist(handle, mg_graph_view, sources_copy);
+
+  rmm::device_uvector<int> one_hop_gpu_id(one_hop_src.size(), handle.get_stream());
+  thrust::fill(handle.get_thrust_policy(),
+               one_hop_gpu_id.begin(),
+               one_hop_gpu_id.end(),
+               handle.get_comms().get_rank());
+
+  // Pull everything to rank 0
+  auto sg_src = cugraph::test::device_gatherv(
+    handle, raft::device_span<vertex_t const>{one_hop_src.data(), one_hop_src.size()});
+  auto sg_dst = cugraph::test::device_gatherv(
+    handle, raft::device_span<vertex_t const>{one_hop_dst.data(), one_hop_dst.size()});
+  auto sg_gpu_id = cugraph::test::device_gatherv(
+    handle, raft::device_span<int const>{one_hop_gpu_id.data(), one_hop_gpu_id.size()});
+  auto sg_sources = cugraph::test::device_gatherv(
+    handle, raft::device_span<vertex_t const>{sources.data(), col_rank == 0 ? sources.size() : 0});
+  auto sg_destination_offsets = cugraph::test::device_gatherv(
+    handle,
+    raft::device_span<edge_t const>{destination_offsets.data(),
+                                    col_rank == 0 ? destination_offsets.size() : 0});
+
+  thrust::sort(handle.get_thrust_policy(),
+               thrust::make_zip_iterator(sg_src.begin(), sg_gpu_id.begin(), sg_dst.begin()),
+               thrust::make_zip_iterator(sg_src.end(), sg_gpu_id.end(), sg_dst.end()));
+
+  std::vector<vertex_t> h_sources(sg_sources.size());
+  std::vector<vertex_t> h_src(sg_src.size());
+  std::vector<vertex_t> h_dst(sg_dst.size());
+  std::vector<vertex_t> h_result_src(sg_destination_offsets.size());
+  std::vector<vertex_t> h_result_dst(sg_destination_offsets.size());
+  std::vector<edge_t> h_destination_offsets(sg_destination_offsets.size());
+
+  raft::update_host(h_sources.data(), sg_sources.data(), sg_sources.size(), handle.get_stream());
+  raft::update_host(h_src.data(), sg_src.data(), sg_src.size(), handle.get_stream());
+  raft::update_host(h_dst.data(), sg_dst.data(), sg_dst.size(), handle.get_stream());
+  raft::update_host(h_destination_offsets.data(),
+                    sg_destination_offsets.data(),
+                    sg_destination_offsets.size(),
+                    handle.get_stream());
+
+  thrust::for_each(thrust::host,
+                   thrust::make_counting_iterator<size_t>(0),
+                   thrust::make_counting_iterator<size_t>(sg_destination_offsets.size()),
+                   [&] __host__(auto i) {
+                     h_result_src[i] = h_sources[i / indices_per_source];
+                     h_result_dst[i] = mg_graph_view.number_of_vertices();
+                     edge_t offset   = h_destination_offsets[i];
+
+                     for (size_t j = 0; j < h_src.size(); ++j) {
+                       if (h_result_src[i] == h_src[j]) {
+                         if (offset == 0) {
+                           h_result_dst[i] = h_dst[j];
+                           break;
+                         }
+                         --offset;
+                       }
+                     }
+                   });
+
+  auto new_end =
+    thrust::remove_if(thrust::host,
+                      thrust::make_zip_iterator(h_result_src.begin(), h_result_dst.begin()),
+                      thrust::make_zip_iterator(h_result_src.end(), h_result_dst.end()),
+                      [invalid_vertex = mg_graph_view.number_of_vertices()] __host__(auto p) {
+                        return (thrust::get<1>(p) == invalid_vertex);
+                      });
+
+  h_result_src.resize(thrust::distance(
+    thrust::make_zip_iterator(h_result_src.begin(), h_result_dst.begin()), new_end));
+  h_result_dst.resize(h_result_src.size());
+
+  return std::make_tuple(std::move(h_result_src), std::move(h_result_dst));
+}
+
 template <typename input_usecase_t>
 class Tests_MG_GatherEdges
   : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
@@ -93,118 +194,77 @@ class Tests_MG_GatherEdges
     // Generate random vertex ids in the range of current gpu
 
     auto [global_degree_offsets, global_out_degrees] =
-      cugraph::detail::original::get_global_degree_information(handle, mg_graph_view);
-    auto global_adjacency_list_offsets = cugraph::detail::original::get_global_adjacency_offset(
-      handle, mg_graph_view, global_degree_offsets, global_out_degrees);
+      cugraph::detail::get_global_degree_information(handle, mg_graph_view);
 
     // Generate random sources to gather on
-    auto random_sources = random_vertex_ids(handle,
-                                            mg_graph_view.local_vertex_partition_range_first(),
-                                            mg_graph_view.local_vertex_partition_range_last(),
-                                            source_sample_count,
-                                            repetitions_per_vertex);
-    rmm::device_uvector<int> random_source_gpu_ids(random_sources.size(), handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(),
-                 random_source_gpu_ids.begin(),
-                 random_source_gpu_ids.end(),
-                 comm_rank);
-
-    auto [active_sources, active_source_gpu_ids] =
-      cugraph::detail::original::gather_active_majors(handle,
-                                                      mg_graph_view,
-                                                      random_sources.cbegin(),
-                                                      random_sources.cend(),
-                                                      random_source_gpu_ids.cbegin());
+    auto random_sources =
+      random_vertex_ids(handle,
+                        mg_graph_view.local_vertex_partition_range_first(),
+                        mg_graph_view.local_vertex_partition_range_last(),
+                        std::min(mg_graph_view.local_vertex_partition_range_size() *
+                                   (repetitions_per_vertex + vertex_t{1}),
+                                 source_sample_count),
+                        repetitions_per_vertex);
+
+    // FIXME: allgather is probably a poor name for this function.
+    //        It's really an allgather across the row communicator
+    auto active_sources =
+      cugraph::detail::allgather_active_majors(handle, std::move(random_sources));
 
     // get source global out degrees to generate indices
-    auto active_source_degrees = cugraph::detail::original::get_active_major_global_degrees(
+    auto active_source_degrees = cugraph::detail::get_active_major_global_degrees(
       handle, mg_graph_view, active_sources, global_out_degrees);
 
-    auto random_destination_indices =
+    auto random_destination_offsets =
       generate_random_destination_indices(handle,
                                           active_source_degrees,
                                           mg_graph_view.number_of_vertices(),
-                                          mg_graph_view.number_of_edges(),
+                                          edge_t{-1},
                                           indices_per_source);
-    rmm::device_uvector<edge_t> input_destination_indices(random_destination_indices.size(),
+
+    rmm::device_uvector<edge_t> input_destination_offsets(random_destination_offsets.size(),
                                                           handle.get_stream());
-    raft::update_device(input_destination_indices.data(),
-                        random_destination_indices.data(),
-                        random_destination_indices.size(),
-                        handle.get_stream());
-
-    auto [src, dst, gpu_ids, dst_map] =
-      cugraph::detail::original::gather_local_edges(handle,
-                                                    mg_graph_view,
-                                                    active_sources,
-                                                    active_source_gpu_ids,
-                                                    std::move(input_destination_indices),
-                                                    indices_per_source,
-                                                    global_degree_offsets,
-                                                    global_adjacency_list_offsets);
+    raft::copy(input_destination_offsets.data(),
+               random_destination_offsets.data(),
+               random_destination_offsets.size(),
+               handle.get_stream());
+
+    auto [src, dst, dst_map] =
+      cugraph::detail::gather_local_edges(handle,
+                                          mg_graph_view,
+                                          active_sources,
+                                          std::move(random_destination_offsets),
+                                          indices_per_source,
+                                          global_degree_offsets);
 
     if (prims_usecase.check_correctness) {
-      // Gather outputs
-      auto mg_out_srcs = cugraph::test::device_gatherv(handle, src.data(), src.size());
-      auto mg_out_dsts = cugraph::test::device_gatherv(handle, dst.data(), dst.size());
-
-      // Gather inputs
-      auto& col_comm      = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-      auto const col_rank = col_comm.get_rank();
-      auto sg_random_srcs = cugraph::test::device_gatherv(
-        handle, active_sources.data(), col_rank == 0 ? active_sources.size() : 0);
-      auto sg_random_dst_indices =
-        cugraph::test::device_gatherv(handle,
-                                      random_destination_indices.data(),
-                                      col_rank == 0 ? random_destination_indices.size() : 0);
-
-      // Gather input graph edgelist
-      rmm::device_uvector<vertex_t> sg_src(0, handle.get_stream());
-      rmm::device_uvector<vertex_t> sg_dst(0, handle.get_stream());
-      std::tie(sg_src, sg_dst, std::ignore) =
-        mg_graph_view.decompress_to_edgelist(handle, std::nullopt);
-
-      auto aggregated_sg_src = cugraph::test::device_gatherv(handle, sg_src.begin(), sg_src.size());
-      auto aggregated_sg_dst = cugraph::test::device_gatherv(handle, sg_dst.begin(), sg_dst.size());
-
-      sort_coo(handle, mg_out_srcs, mg_out_dsts);
-
-      if (handle.get_comms().get_rank() == int{0}) {
-        cugraph::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(handle);
-        auto aggregated_edge_iter = thrust::make_zip_iterator(
-          thrust::make_tuple(aggregated_sg_src.begin(), aggregated_sg_dst.begin()));
-        thrust::sort(handle.get_thrust_policy(),
-                     aggregated_edge_iter,
-                     aggregated_edge_iter + aggregated_sg_src.size());
-        auto sg_graph_properties =
-          cugraph::graph_properties_t{mg_graph_view.is_symmetric(), mg_graph_view.is_multigraph()};
-
-        std::tie(sg_graph, std::ignore) =
-          cugraph::create_graph_from_edgelist<vertex_t, edge_t, weight_t, false, false>(
-            handle,
-            std::nullopt,
-            std::move(aggregated_sg_src),
-            std::move(aggregated_sg_dst),
-            std::nullopt,
-            sg_graph_properties,
-            false);
-        auto sg_graph_view = sg_graph.view();
-        // Call single gpu gather
-        auto [sg_out_srcs, sg_out_dsts] = sg_gather_edges(handle,
-                                                          sg_graph_view,
-                                                          sg_random_srcs.begin(),
-                                                          sg_random_srcs.end(),
-                                                          sg_random_dst_indices.begin(),
-                                                          sg_graph_view.number_of_vertices(),
-                                                          indices_per_source);
-        sort_coo(handle, sg_out_srcs, sg_out_dsts);
-
-        auto passed = thrust::equal(
-          handle.get_thrust_policy(), sg_out_srcs.begin(), sg_out_srcs.end(), mg_out_srcs.begin());
-        passed &= thrust::equal(
-          handle.get_thrust_policy(), sg_out_dsts.begin(), sg_out_dsts.end(), mg_out_dsts.begin());
-        ASSERT_TRUE(passed);
-      }
+      // NOTE: This test assumes that edgea within the data structure are sorted
+      //  We'll use gather_one_hop_edgelist to pull out the relevant edges
+      auto [h_src, h_dst] = test_gather_local_edges(
+        handle, mg_graph_view, active_sources, input_destination_offsets, indices_per_source);
+
+      auto agg_src = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{src.data(), src.size()});
+      auto agg_dst = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{dst.data(), dst.size()});
+
+      thrust::sort(handle.get_thrust_policy(),
+                   thrust::make_zip_iterator(agg_src.begin(), agg_dst.begin()),
+                   thrust::make_zip_iterator(agg_src.end(), agg_dst.end()));
+      thrust::sort(thrust::host,
+                   thrust::make_zip_iterator(h_src.begin(), h_dst.begin()),
+                   thrust::make_zip_iterator(h_src.end(), h_dst.end()));
+
+      std::vector<vertex_t> h_agg_src(agg_src.size());
+      std::vector<vertex_t> h_agg_dst(agg_dst.size());
+      raft::update_host(h_agg_src.data(), agg_src.data(), agg_src.size(), handle.get_stream());
+      raft::update_host(h_agg_dst.data(), agg_dst.data(), agg_dst.size(), handle.get_stream());
+
+      // FIXME:  Why are the randomly selected vertices on each GPU so similar??
+
+      auto passed = thrust::equal(thrust::host, h_src.begin(), h_src.end(), h_agg_src.begin());
+      passed &= thrust::equal(thrust::host, h_dst.begin(), h_dst.end(), h_agg_dst.begin());
+      ASSERT_TRUE(passed);
     }
   }
 };
diff --git a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh
index 96e6d129b24..093c231dfce 100644
--- a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh
+++ b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <cugraph/algorithms.hpp>
-#include <cugraph/detail/graph_functions.cuh>
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
diff --git a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
index a036dda5fb1..12a387d4589 100644
--- a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
+++ b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
@@ -90,12 +90,15 @@ class Tests_MG_Nbr_Sampling
     constexpr vertex_t source_sample_count    = 3;
 
     // Generate random vertex ids in the range of current gpu
-    auto random_sources = random_vertex_ids(handle,
-                                            mg_graph_view.local_vertex_partition_range_first(),
-                                            mg_graph_view.local_vertex_partition_range_last(),
-                                            source_sample_count,
-                                            repetitions_per_vertex,
-                                            comm_rank);
+    auto random_sources =
+      random_vertex_ids(handle,
+                        mg_graph_view.local_vertex_partition_range_first(),
+                        mg_graph_view.local_vertex_partition_range_last(),
+                        std::min(mg_graph_view.local_vertex_partition_range_size() *
+                                   (repetitions_per_vertex + vertex_t{1}),
+                                 source_sample_count),
+                        repetitions_per_vertex,
+                        comm_rank);
 
     std::vector<int> h_fan_out{indices_per_source};  // depth = 1
 
@@ -108,14 +111,14 @@ class Tests_MG_Nbr_Sampling
 
     if (prims_usecase.check_correctness) {
       // Consolidate results on GPU 0
-      auto d_mg_start_src =
-        cugraph::test::device_gatherv(handle, random_sources.data(), random_sources.size());
-      auto d_mg_aggregate_src =
-        cugraph::test::device_gatherv(handle, d_src_out.data(), d_src_out.size());
-      auto d_mg_aggregate_dst =
-        cugraph::test::device_gatherv(handle, d_dst_out.data(), d_dst_out.size());
-      auto d_mg_aggregate_indices =
-        cugraph::test::device_gatherv(handle, d_indices.data(), d_indices.size());
+      auto d_mg_start_src = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto d_mg_aggregate_src = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{d_src_out.data(), d_src_out.size()});
+      auto d_mg_aggregate_dst = cugraph::test::device_gatherv(
+        handle, raft::device_span<vertex_t const>{d_dst_out.data(), d_dst_out.size()});
+      auto d_mg_aggregate_indices = cugraph::test::device_gatherv(
+        handle, raft::device_span<weight_t const>{d_indices.data(), d_indices.size()});
 
 #if 0
       // FIXME:  extract_induced_subgraphs not currently support MG, so we'll skip this validation
diff --git a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
index 241de0bf747..346c6e1d449 100644
--- a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
+++ b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
@@ -72,7 +72,9 @@ class Tests_Uniform_Neighbor_Sampling
       cugraph::test::random_vertex_ids(handle,
                                        graph_view.local_vertex_partition_range_first(),
                                        graph_view.local_vertex_partition_range_last(),
-                                       source_sample_count,
+                                       std::min(graph_view.local_vertex_partition_range_size() *
+                                                  (repetitions_per_vertex + vertex_t{1}),
+                                                source_sample_count),
                                        repetitions_per_vertex,
                                        uint64_t{0});
 
diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu
index 9937e59273d..9ce1f35a33c 100644
--- a/cpp/tests/utilities/device_comm_wrapper.cu
+++ b/cpp/tests/utilities/device_comm_wrapper.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,11 +26,13 @@ namespace cugraph {
 namespace test {
 
 template <typename T>
-rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size)
+rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle,
+                                      raft::device_span<T const> d_input)
+
 {
   bool is_root = handle.get_comms().get_rank() == int{0};
   auto rx_sizes =
-    cugraph::host_scalar_gather(handle.get_comms(), size, int{0}, handle.get_stream());
+    cugraph::host_scalar_gather(handle.get_comms(), d_input.size(), int{0}, handle.get_stream());
   std::vector<size_t> rx_displs(is_root ? static_cast<size_t>(handle.get_comms().get_size())
                                         : size_t{0});
   if (is_root) { std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); }
@@ -39,9 +41,9 @@ rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle, T const* d_i
     is_root ? std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream());
 
   cugraph::device_gatherv(handle.get_comms(),
-                          d_input,
+                          d_input.data(),
                           gathered_v.data(),
-                          size,
+                          d_input.size(),
                           rx_sizes,
                           rx_displs,
                           int{0},
@@ -50,23 +52,53 @@ rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle, T const* d_i
   return gathered_v;
 }
 
+template <typename T>
+rmm::device_uvector<T> device_allgatherv(raft::handle_t const& handle,
+                                         raft::device_span<T const> d_input)
+{
+  auto rx_sizes =
+    cugraph::host_scalar_allgather(handle.get_comms(), d_input.size(), handle.get_stream());
+  std::vector<size_t> rx_displs(static_cast<size_t>(handle.get_comms().get_size()));
+  std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
+
+  rmm::device_uvector<T> gathered_v(std::reduce(rx_sizes.begin(), rx_sizes.end()),
+                                    handle.get_stream());
+
+  cugraph::device_allgatherv(handle.get_comms(),
+                             d_input.data(),
+                             gathered_v.data(),
+                             rx_sizes,
+                             rx_displs,
+                             handle.get_stream());
+
+  return gathered_v;
+}
+
 // explicit instantiation
 
 template rmm::device_uvector<int32_t> device_gatherv(raft::handle_t const& handle,
-                                                     int32_t const* d_input,
-                                                     size_t size);
+                                                     raft::device_span<int32_t const> d_input);
 
 template rmm::device_uvector<int64_t> device_gatherv(raft::handle_t const& handle,
-                                                     int64_t const* d_input,
-                                                     size_t size);
+                                                     raft::device_span<int64_t const> d_input);
 
 template rmm::device_uvector<float> device_gatherv(raft::handle_t const& handle,
-                                                   float const* d_input,
-                                                   size_t size);
+                                                   raft::device_span<float const> d_input);
 
 template rmm::device_uvector<double> device_gatherv(raft::handle_t const& handle,
-                                                    double const* d_input,
-                                                    size_t size);
+                                                    raft::device_span<double const> d_input);
+
+template rmm::device_uvector<int32_t> device_allgatherv(raft::handle_t const& handle,
+                                                        raft::device_span<int32_t const> d_input);
+
+template rmm::device_uvector<int64_t> device_allgatherv(raft::handle_t const& handle,
+                                                        raft::device_span<int64_t const> d_input);
+
+template rmm::device_uvector<float> device_allgatherv(raft::handle_t const& handle,
+                                                      raft::device_span<float const> d_input);
+
+template rmm::device_uvector<double> device_allgatherv(raft::handle_t const& handle,
+                                                       raft::device_span<double const> d_input);
 
 }  // namespace test
 }  // namespace cugraph
diff --git a/cpp/tests/utilities/device_comm_wrapper.hpp b/cpp/tests/utilities/device_comm_wrapper.hpp
index 55145edd71b..c1d7b6b8250 100644
--- a/cpp/tests/utilities/device_comm_wrapper.hpp
+++ b/cpp/tests/utilities/device_comm_wrapper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,14 +16,35 @@
 
 #pragma once
 
+#include <raft/core/span.hpp>
 #include <raft/handle.hpp>
+
 #include <rmm/device_uvector.hpp>
 
 namespace cugraph {
 namespace test {
 
 template <typename T>
-rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size);
+rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle,
+                                      raft::device_span<T const> d_input);
+
+template <typename T>
+rmm::device_uvector<T> device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size)
+{
+  return device_gatherv(handle, raft::device_span<T const>{d_input, size});
+}
+
+template <typename T>
+rmm::device_uvector<T> device_allgatherv(raft::handle_t const& handle,
+                                         raft::device_span<T const> d_input);
+
+template <typename T>
+rmm::device_uvector<T> device_allgatherv(raft::handle_t const& handle,
+                                         T const* d_input,
+                                         size_t size)
+{
+  return device_allgatherv(handle, raft::device_span<T const>{d_input, size});
+}
 
 }  // namespace test
 }  // namespace cugraph
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
index 396b73afee5..cea48f5c420 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
@@ -183,7 +183,7 @@ cdef extern from "cugraph_c/algorithms.h":
         
     # uniform neighborhood sampling
     cdef cugraph_error_code_t \
-        cugraph_experimental_uniform_neighbor_sample(
+        cugraph_uniform_neighbor_sample(
             const cugraph_resource_handle_t* handle,
             cugraph_graph_t* graph,
             const cugraph_type_erased_device_array_view_t* start,
diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
index 8dfea32d821..e23a35396fb 100644
--- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
+++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx
@@ -37,7 +37,7 @@ from pylibcugraph._cugraph_c.graph cimport (
     cugraph_graph_t,
 )
 from pylibcugraph._cugraph_c.algorithms cimport (
-    cugraph_experimental_uniform_neighbor_sample,
+    cugraph_uniform_neighbor_sample,
     cugraph_sample_result_t,
     cugraph_sample_result_get_sources,
     cugraph_sample_result_get_destinations,
@@ -130,7 +130,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle,
             len(h_fan_out),
             get_c_type_from_numpy_type(h_fan_out.dtype))
 
-    error_code = cugraph_experimental_uniform_neighbor_sample(
+    error_code = cugraph_uniform_neighbor_sample(
         c_resource_handle_ptr,
         c_graph_ptr,
         start_ptr,