diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 87d26bfd848..63a91d4971f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -195,7 +195,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/traversal/legacy/bfs.cu src/link_prediction/legacy/jaccard.cu src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu @@ -234,7 +233,6 @@ set(CUGRAPH_SOURCES src/cores/k_core_sg.cu src/cores/k_core_mg.cu src/components/legacy/connectivity.cu - src/centrality/legacy/betweenness_centrality.cu src/generators/generate_rmat_edgelist.cu src/generators/generate_bipartite_rmat_edgelist.cu src/generators/generator_tools.cu diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cu b/cpp/src/centrality/legacy/betweenness_centrality.cu deleted file mode 100644 index cd274a408e1..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cu +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include "betweenness_centrality.cuh" -#include "betweenness_centrality_kernels.cuh" -#include - -namespace cugraph { -namespace detail { -namespace { -template -void betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t total_number_of_sources) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - bc.rescale_by_total_sources_used(total_number_of_sources); -} - -template -void edge_betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t /* total_number_of_sources */) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = true; - bool endpoints = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - // NOTE: As of 07/2020 
NetworkX does not apply rescaling based on number - // of sources - // bc.rescale_by_total_sources_used(total_number_of_sources); -} -template -vertex_t get_total_number_of_sources(raft::handle_t const& handle, vertex_t local_number_of_sources) -{ - vertex_t total_number_of_sources_used = local_number_of_sources; - if (handle.comms_initialized()) { - rmm::device_scalar d_number_of_sources(local_number_of_sources, handle.get_stream()); - handle.get_comms().allreduce(d_number_of_sources.data(), - d_number_of_sources.data(), - 1, - raft::comms::op_t::SUM, - handle.get_stream()); - total_number_of_sources_used = d_number_of_sources.value(handle.get_stream()); - // RAFT_CUDA_TRY( - // cudaMemcpy(&total_number_of_sources_used, data, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - } - return total_number_of_sources_used; -} -} // namespace - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - static_assert(std::is_same::value, "vertex_t should be int"); - static_assert(std::is_same::value, "edge_t should be int"); - static_assert(std::is_same::value || std::is_same::value, - "weight_t should be float or double"); - static_assert(std::is_same::value || std::is_same::value, - "result_t should be float or double"); - - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: betwenness pointer is NULL"); - CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be positive or equal to 0."); - if (number_of_sources != 0) { - CUGRAPH_EXPECTS(sources != nullptr, - "Sources cannot be NULL if number_of_source is different from 0."); - } - if (is_edge_betweenness) { - CUGRAPH_EXPECTS(!endpoints, "Endpoints is not supported for edge betweenness centrality."); - } -} - -template -void BC::setup() -{ - number_of_vertices_ = graph_.number_of_vertices; - number_of_edges_ = graph_.number_of_edges; - offsets_ptr_ = graph_.offsets; - indices_ptr_ = graph_.indices; -} - -template -void BC::configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalized, - bool endpoints, - weight_t const* weights, - vertex_t const* sources, - vertex_t number_of_sources) -{ - // --- Bind betweenness output vector to internal --- - betweenness_ = betweenness; - normalized_ = normalized; - endpoints_ = endpoints; - sources_ = sources; - number_of_sources_ = number_of_sources; - edge_weights_ptr_ = weights; - is_edge_betweenness_ = is_edge_betweenness; - - // --- Working data allocation --- - initialize_work_vectors(); - initialize_pointers_to_vectors(); - - // --- Get Device Information --- - initialize_device_information(); - - // --- Confirm that configuration went through --- - configured_ = true; -} - -template -void BC::initialize_work_vectors() -{ - distances_vec_.resize(number_of_vertices_); - predecessors_vec_.resize(number_of_vertices_); - sp_counters_vec_.resize(number_of_vertices_); - deltas_vec_.resize(number_of_vertices_); -} - -template -void BC::initialize_pointers_to_vectors() -{ - distances_ = distances_vec_.data().get(); - predecessors_ = predecessors_vec_.data().get(); - sp_counters_ = sp_counters_vec_.data().get(); - deltas_ = deltas_vec_.data().get(); -} - -template -void BC::initialize_device_information() -{ - max_grid_dim_1D_ = handle_.get_device_properties().maxGridSize[0]; - max_block_dim_1D_ = handle_.get_device_properties().maxThreadsDim[0]; -} - -template -void BC::compute() -{ - CUGRAPH_EXPECTS(configured_, 
"BC must be configured before computation"); - if (sources_) { - for (vertex_t source_idx = 0; source_idx < number_of_sources_; ++source_idx) { - vertex_t source_vertex = sources_[source_idx]; - compute_single_source(source_vertex); - } - } else { - for (vertex_t source_vertex = 0; source_vertex < number_of_vertices_; ++source_vertex) { - compute_single_source(source_vertex); - } - } - rescale(); -} - -template -void BC::compute_single_source(vertex_t source_vertex) -{ - // Step 1) Singe-source shortest-path problem - cugraph::bfs(handle_, - graph_, - distances_, - predecessors_, - sp_counters_, - source_vertex, - graph_.prop.directed, - true); - - // FIXME: Remove that with a BC specific class to gather - // information during traversal - - // Numeric max value is replaced by -1 as we look for the maximal depth of - // the traversal, this value is avalaible within the bfs implementation and - // there could be a way to access it directly and avoid both replace and the - // max - thrust::replace(handle_.get_thrust_policy(), - distances_, - distances_ + number_of_vertices_, - std::numeric_limits::max(), - static_cast(-1)); - auto current_max_depth = - thrust::max_element(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_); - vertex_t max_depth = 0; - RAFT_CUDA_TRY( - cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - // Step 2) Dependency accumulation - accumulate(source_vertex, max_depth); -} - -template -void BC::accumulate(vertex_t source_vertex, - vertex_t max_depth) -{ - dim3 grid_configuration, block_configuration; - block_configuration.x = max_block_dim_1D_; - grid_configuration.x = min(max_grid_dim_1D_, (number_of_edges_ / block_configuration.x + 1)); - - initialize_dependencies(); - - if (is_edge_betweenness_) { - accumulate_edges(max_depth, grid_configuration, block_configuration); - } else if (endpoints_) { - accumulate_vertices_with_endpoints( - source_vertex, max_depth, grid_configuration, block_configuration); - } else { - accumulate_vertices(max_depth, grid_configuration, block_configuration); - } -} - -template -void BC::initialize_dependencies() -{ - thrust::fill( - handle_.get_thrust_policy(), deltas_, deltas_ + number_of_vertices_, static_cast(0)); -} -template -void BC::accumulate_edges(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth >= 0; --depth) { - edges_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } -} - -template -void BC::accumulate_vertices_with_endpoints( - vertex_t source_vertex, vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - endpoints_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_reached_endpoints_to_source_betweenness(source_vertex); - add_vertices_dependencies_to_betweenness(); -} - -// Distances should contain -1 for unreached nodes, - -// FIXME: There might be a cleaner way to add a value to a single -// score in the betweenness vector -template -void BC::add_reached_endpoints_to_source_betweenness( - vertex_t source_vertex) -{ - vertex_t number_of_unvisited_vertices = - thrust::count(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_, -1); - vertex_t number_of_visited_vertices_except_source = - 
number_of_vertices_ - number_of_unvisited_vertices - 1; - rmm::device_vector buffer(1); - buffer[0] = number_of_visited_vertices_except_source; - thrust::transform(handle_.get_thrust_policy(), - buffer.begin(), - buffer.end(), - betweenness_ + source_vertex, - betweenness_ + source_vertex, - thrust::plus()); -} - -template -void BC::add_vertices_dependencies_to_betweenness() -{ - thrust::transform(handle_.get_thrust_policy(), - deltas_, - deltas_ + number_of_vertices_, - betweenness_, - betweenness_, - thrust::plus()); -} - -template -void BC::accumulate_vertices(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_vertices_dependencies_to_betweenness(); -} - -template -void BC::rescale() -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - if (normalized_) { - if (is_edge_betweenness_) { - std::tie(rescale_factor, modified) = - rescale_edges_betweenness_centrality(rescale_factor, modified); - } else { - std::tie(rescale_factor, modified) = - rescale_vertices_betweenness_centrality(rescale_factor, modified); - } - } else { - if (!graph_.prop.directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} - -template -std::tuple -BC::rescale_edges_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices_ = static_cast(number_of_vertices_); - if (number_of_vertices_ > 1) { - rescale_factor /= ((casted_number_of_vertices_) * (casted_number_of_vertices_ - 1)); - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -std::tuple -BC::rescale_vertices_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - if (number_of_vertices_ > 2) { - if (endpoints_) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -void BC::apply_rescale_factor_to_betweenness( - result_t rescale_factor) -{ - size_t result_size = number_of_vertices_; - if (is_edge_betweenness_) result_size = number_of_edges_; - thrust::transform(handle_.get_thrust_policy(), - betweenness_, - betweenness_ + result_size, - thrust::make_constant_iterator(rescale_factor), - betweenness_, - thrust::multiplies()); -} - -template -void BC::rescale_by_total_sources_used( - vertex_t total_number_of_sources_used) -{ - result_t rescale_factor = static_cast(1); - result_t casted_total_number_of_sources_used = - static_cast(total_number_of_sources_used); - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - - if (normalized_) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } else if (!graph_.prop.directed) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} -} // namespace detail - -template -void 
betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_vertices, 0); - detail::betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::betweenness_centrality_impl(handle, - graph, - result, - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - } -} - -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - bool, - float const*, - int, - int const*); -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - double*, - bool, - bool, - double const*, - int, - int const*); - -template -void edge_betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_edges, 0); - detail::edge_betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::edge_betweenness_centrality_impl( - handle, graph, result, normalize, weight, k, vertices, total_number_of_sources_used); - } -} - -template void edge_betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - float const*, - int, - int const*); - -template void edge_betweenness_centrality( - raft::handle_t const& handle, - legacy::GraphCSRView const&, - double*, - bool, - double const*, - int, - int const*); -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cuh b/cpp/src/centrality/legacy/betweenness_centrality.cuh deleted file mode 100644 index 43f095d634f..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cuh +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Xavier Cadet xcadet@nvidia.com - -#pragma once -#include - -namespace cugraph { -namespace detail { -template -void betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void edge_betweenness_centrality(legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -class BC { - public: - virtual ~BC(void) {} - BC(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - cudaStream_t stream = 0) - : handle_(handle), graph_(graph) - { - setup(); - } - void configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - - void configure_edge(result_t* betweenness, - bool normalize, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - void compute(); - void rescale_by_total_sources_used(vertex_t total_number_of_sources_used); - - private: - // --- RAFT handle --- - raft::handle_t const& handle_; - // --- Information concerning the graph --- - const legacy::GraphCSRView& graph_; - // --- These information are extracted on setup --- - vertex_t number_of_vertices_; // Number of vertices in the graph - vertex_t number_of_edges_; // Number of edges in the graph - edge_t const* offsets_ptr_; // Pointer to the offsets - vertex_t const* indices_ptr_; // Pointers to the indices - - // --- Information from configuration --- - bool configured_ = false; // Flag to ensure configuration was called - bool normalized_ = false; // If True normalize the betweenness - bool is_edge_betweenness_ = false; // If True compute edge_betweeness - - // FIXME: For weighted version - weight_t const* edge_weights_ptr_ = nullptr; // Pointer to the weights - bool endpoints_ = false; // If True normalize the betweenness - vertex_t const* sources_ = nullptr; // Subset of vertices to gather information from - vertex_t number_of_sources_; // Number of vertices in sources - - // --- Output ---- - // betweenness is set/read by users - using Vectors - result_t* betweenness_ = nullptr; - - // --- Data required to perform computation ---- - rmm::device_vector distances_vec_; - rmm::device_vector predecessors_vec_; - rmm::device_vector sp_counters_vec_; - rmm::device_vector deltas_vec_; - - vertex_t* distances_ = - nullptr; // array(|V|) stores the distances gathered by the latest SSSP - vertex_t* predecessors_ = - nullptr; // array(|V|) stores the predecessors of the latest SSSP - double* sp_counters_ = - nullptr; // array(|V|) stores the shortest path counter for the latest SSSP - double* deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP - - int max_grid_dim_1D_ = 0; - int max_block_dim_1D_ = 0; - - void setup(); - - void initialize_work_vectors(); - void initialize_pointers_to_vectors(); - void initialize_device_information(); - - void compute_single_source(vertex_t source_vertex); - - void accumulate(vertex_t source_vertex, vertex_t max_depth); - void initialize_dependencies(); - void 
accumulate_edges(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void accumulate_vertices_with_endpoints(vertex_t source_vertex, - vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration); - void accumulate_vertices(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void add_reached_endpoints_to_source_betweenness(vertex_t source_vertex); - void add_vertices_dependencies_to_betweenness(); - - void rescale(); - std::tuple rescale_vertices_betweenness_centrality(result_t rescale_factor, - bool modified); - std::tuple rescale_edges_betweenness_centrality(result_t rescale_factor, - bool modified); - void apply_rescale_factor_to_betweenness(result_t scaling_factor); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh b/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh deleted file mode 100644 index b0ccb669376..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace cugraph { -namespace detail { -// Dependecy Accumulation: based on McLaughlin and Bader, 2018 -// FIXME: Accumulation kernel mights not scale well, as each thread is handling -// all the edges for each node, an approach similar to the traversal -// bucket (i.e. BFS / SSSP) system might enable speed up. 
-// Should look into forAllEdge type primitive for different -// load balancing -template -__global__ void edges_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - double coefficient = vertex_sigma * factor; - - vertex_delta += coefficient; - betweenness[edge_idx] += coefficient; - } - } - deltas[vertex] = vertex_delta; - } - } -} - -template -__global__ void endpoints_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - betweenness[vertex] += 1; - deltas[vertex] = vertex_delta; - } - } -} -template -__global__ void accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - deltas[vertex] = vertex_delta; - } - } -} -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cu b/cpp/src/traversal/legacy/bfs.cu deleted file mode 100644 index a0fb11a98d9..00000000000 --- a/cpp/src/traversal/legacy/bfs.cu +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -#include "bfs.cuh" -#include -#include -#include - -#include - -#include "bfs_kernels.cuh" -#include "traversal_common.cuh" -#include -#include - -namespace cugraph { -namespace detail { -enum BFS_ALGO_STATE { TOPDOWN, BOTTOMUP }; - -template -void BFS::setup() -{ - // Determinism flag, false by default - deterministic = false; - - // Working data - // Each vertex can be in the frontier at most once - // We will update frontier during the execution - // We need the orig to reset frontier, or ALLOC_FREE_TRY - original_frontier.resize(number_of_vertices); - frontier = original_frontier.data().get(); - - // size of bitmaps for vertices - vertices_bmap_size = (number_of_vertices / (8 * sizeof(int)) + 1); - // ith bit of visited_bmap is set <=> ith vertex is visited - - visited_bmap.resize(vertices_bmap_size); - - // ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - isolated_bmap.resize(vertices_bmap_size); - - // vertices_degree[i] = degree of vertex i - vertex_degree.resize(number_of_vertices); - - // We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it - // since those uses are mutually exclusive - buffer_np1_1.resize(number_of_vertices + 1); - buffer_np1_2.resize(number_of_vertices + 1); - - // Using buffers : top down - - // frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1.data().get(); - // exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2.data().get(); - - // Using buffers : bottom up - // contains list of unvisited vertices - unvisited_queue = buffer_np1_1.data().get(); - // size of the "last" unvisited queue : size_last_unvisited_queue - // refers to the size of unvisited_queue - // which may not be up to date (the queue may contains vertices that are now - // visited) - - // We may leave vertices unvisited after bottom up main kernels - storing them - // here - left_unvisited_queue = buffer_np1_2.data().get(); - - // We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
- // frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of - // the first edge of the bucket See top down kernels for more details - exclusive_sum_frontier_vertex_buckets_offsets.resize( - ((number_of_edges / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2)); - - // Init device-side counters - // Those counters must be/can be reset at each bfs iteration - // Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the - // current bottleneck - d_counters_pad.resize(4); - - d_new_frontier_cnt = d_counters_pad.data().get(); - d_mu = d_counters_pad.data().get() + 1; - d_unvisited_cnt = d_counters_pad.data().get() + 2; - d_left_unvisited_cnt = d_counters_pad.data().get() + 3; - - // Lets use this int* for the next 3 lines - // Its dereferenced value is not initialized - so we dont care about what we - // put in it - IndexType* d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - - // Computing isolated_bmap - // Only dependent on graph - not source vertex - done once - traversal::flag_isolated_vertices(number_of_vertices, - isolated_bmap.data().get(), - row_offsets, - vertex_degree.data().get(), - d_nisolated, - stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We need nisolated to be ready to use - cudaStreamSynchronize(stream); -} - -template -void BFS::configure(IndexType* _distances, - IndexType* _predecessors, - double* _sp_counters, - int* _edge_mask) -{ - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - sp_counters = _sp_counters; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - // We need distances to use bottom up - if (directed && !computeDistances) { - distances_vals.resize(number_of_vertices); - distances = distances_vals.data().get(); - } - - // In case the shortest path counters is required, previous_bmap has to be allocated - if (sp_counters) { previous_visited_bmap.resize(vertices_bmap_size); } -} - -template -void BFS::traverse(IndexType source_vertex) -{ - // Init visited_bmap - // If the graph is undirected, we not that - // we will never discover isolated vertices (in degree = out degree = 0) - // we avoid a lot of work by flagging them now - // in g500 graphs they represent ~25% of total vertices - // more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap.data().get(), 0, vertices_bmap_size * sizeof(int), stream); - } else { - cudaMemcpyAsync(visited_bmap.data().get(), - isolated_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - - // If needed, setting all vertices as undiscovered (inf distance) - // We dont use computeDistances here - // if the graph is undirected, we may need distances even if - // computeDistances is false - if (distances) - traversal::fill_vec(distances, number_of_vertices, traversal::vec_t::max, stream); - - // If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) { - cudaMemsetAsync(predecessors, -1, number_of_vertices * sizeof(IndexType), stream); - } - - if (sp_counters) { - cudaMemsetAsync(sp_counters, 0, number_of_vertices * sizeof(double), stream); - double value = 1; - cudaMemcpyAsync(sp_counters + source_vertex, &value, sizeof(double), cudaMemcpyHostToDevice); - } - - // - // Initial frontier - // - - frontier = 
original_frontier.data().get(); - - if (distances) { cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); } - - // Setting source_vertex as visited - // There may be bit already set on that bmap (isolated vertices) - if the - // graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - visited_bmap.data().get() + (source_vertex / INT_SIZE), - sizeof(int), - cudaMemcpyDeviceToHost); - // We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - } - - int m = (1 << (source_vertex % INT_SIZE)); - - // In that case, source is isolated, done now - if (!directed && (m & current_visited_bmap_source_vert)) { - // Init distances and predecessors are done, (cf Streamsync in previous if) - return; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync(visited_bmap.data().get() + (source_vertex / INT_SIZE), - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - - // Adding source_vertex to init frontier - cudaMemcpyAsync(&frontier[0], &source_vertex, sizeof(IndexType), cudaMemcpyHostToDevice, stream); - - // mf : edges in frontier - // nf : vertices in frontier - // mu : edges undiscovered - // nu : nodes undiscovered - // lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - // Frontier has one vertex - nf = 1; - - // all edges are undiscovered (by def isolated vertices have 0 edges) - mu = number_of_edges; - - // all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - // That number is wrong if source_vertex is also isolated - but it's not important - nu = number_of_vertices - nisolated - nf; - - // Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = number_of_vertices; // we just need value > 0 - IndexType size_last_unvisited_queue = 0; // queue empty - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - - // At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - // useDistances : we check if a vertex is a parent using distances in bottom up - distances become - // working data undirected g : need parents to be in children's neighbors - - // In case the shortest path counters need to be computeed, the bottom_up approach cannot be used - // bool can_use_bottom_up = (!sp_counters && !directed && distances); - bool can_use_bottom_up = false; - - while (nf > 0) { - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - // Choosing algo - // Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < number_of_vertices / beta) { - // We need to prepare the switch back to top down - // We couldnt keep track of mu during bottom up - because we dont know what mf is. 
- // Computing mu here - bfs_kernels::count_unvisited_edges(unvisited_queue, - size_last_unvisited_queue, - visited_bmap.data().get(), - vertex_degree.data().get(), - d_mu, - stream); - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We will need mf and mu - cudaStreamSynchronize(stream); - algo_state = TOPDOWN; - } - break; - } - } - - // Executing algo - - switch (algo_state) { - case TOPDOWN: - // This step is only required if sp_counters is not nullptr - if (sp_counters) { - cudaMemcpyAsync(previous_visited_bmap.data().get(), - visited_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - // We need to copy the visited_bmap before doing the traversal - cudaStreamSynchronize(stream); - } - traversal::compute_bucket_offsets( - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - nf, - mf, - stream); - bfs_kernels::frontier_expand(row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - previous_visited_bmap.data().get(), - visited_bmap.data().get(), - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap.data().get(), - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We need nf - cudaStreamSynchronize(stream); - - if (nf) { - // Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, new_frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - } - break; - - case BOTTOMUP: - bfs_kernels::fill_unvisited_queue(visited_bmap.data().get(), - vertices_bmap_size, - number_of_vertices, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bfs_kernels::bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - // The number of vertices left unvisited decreases - // If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync(&size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - RAFT_CHECK_CUDA(stream); - // We need last_left_unvisited_size - cudaStreamSynchronize(stream); - bfs_kernels::bottom_up_large(left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We will need nf - cudaStreamSynchronize(stream); - break; - } - - // Updating undiscovered edges count - nu -= nf; - - // Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } -} - -template -void BFS::resetDevicePointers() -{ - cudaMemsetAsync(d_counters_pad.data().get(), 0, 4 * sizeof(IndexType), stream); -} - -template -void BFS::clean() -{ - // the vectors have a destructor that takes care of cleaning -} - -// Explicit Instantiation -template class BFS; -template class BFS; -template class BFS; - -} // namespace detail - -// NOTE: SP counter increase extremely fast on large graph -// It can easily reach 1e40~1e70 on GAP-road.mtx -template -void bfs(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - VT* distances, - VT* predecessors, - double* sp_counters, - const VT start_vertex, - bool directed, - bool mg_batch) -{ - static_assert(std::is_integral::value && sizeof(VT) >= sizeof(int32_t), - "Unsupported vertex id data type. Use integral types of size >= sizeof(int32_t)"); - static_assert(std::is_same::value, - "VT and ET should be the same time for the current BFS implementation"); - static_assert(std::is_floating_point::value, - "Unsupported edge weight type. 
Use floating point types"); // actually, this is - // unnecessary for BFS - if (handle.comms_initialized() && !mg_batch) { - CUGRAPH_FAIL("NO LONGER SUPPORTED"); - } else { - VT number_of_vertices = graph.number_of_vertices; - ET number_of_edges = graph.number_of_edges; - - const VT* indices_ptr = graph.indices; - const ET* offsets_ptr = graph.offsets; - - int alpha = 15; - int beta = 18; - // FIXME: Use VT and ET in the BFS detail - cugraph::detail::BFS bfs( - number_of_vertices, number_of_edges, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances, predecessors, sp_counters, nullptr); - bfs.traverse(start_vertex); - } -} - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cuh b/cpp/src/traversal/legacy/bfs.cuh deleted file mode 100644 index dd636a2c97c..00000000000 --- a/cpp/src/traversal/legacy/bfs.cuh +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -#pragma once - -#include -#include - -#define TRAVERSAL_DEFAULT_ALPHA 15 - -#define TRAVERSAL_DEFAULT_BETA 18 - -namespace cugraph { -namespace detail { -// FIXME: Differentiate IndexType for vertices and edges -template -class BFS { - private: - IndexType number_of_vertices, number_of_edges; - const IndexType* row_offsets = nullptr; - const IndexType* col_indices = nullptr; - - bool directed; - bool deterministic; - - // edgemask, distances, predecessors are set/read by users - using Vectors - bool useEdgeMask; - bool computeDistances; - bool computePredecessors; - rmm::device_vector distances_vals; - IndexType* distances = nullptr; - IndexType* predecessors = nullptr; - double* sp_counters = nullptr; - int* edge_mask = nullptr; - - rmm::device_vector original_frontier; - rmm::device_vector visited_bmap; - rmm::device_vector isolated_bmap; - rmm::device_vector previous_visited_bmap; - rmm::device_vector vertex_degree; - rmm::device_vector buffer_np1_1; - rmm::device_vector buffer_np1_2; - rmm::device_vector exclusive_sum_frontier_vertex_buckets_offsets; - rmm::device_vector d_counters_pad; - // Working data - // For complete description of each, go to bfs.cu - IndexType nisolated; - IndexType* frontier = nullptr; - IndexType* new_frontier = nullptr; - IndexType* frontier_vertex_degree = nullptr; - IndexType* exclusive_sum_frontier_vertex_degree = nullptr; - IndexType* unvisited_queue = nullptr; - IndexType* left_unvisited_queue = nullptr; - IndexType* d_new_frontier_cnt = nullptr; - IndexType* d_mu = nullptr; - IndexType* d_unvisited_cnt = nullptr; - IndexType* d_left_unvisited_cnt = nullptr; - - IndexType vertices_bmap_size; - - // Parameters for direction optimizing - IndexType alpha, beta; - cudaStream_t stream; - - // resets pointers defined by d_counters_pad (see implem) - void resetDevicePointers(); - void setup(); - void clean(); - - public: - virtual ~BFS(void) { clean(); } - - BFS(IndexType _number_of_vertices, - IndexType _number_of_edges, - const IndexType* _row_offsets, - const IndexType* _col_indices, - bool _directed, - IndexType _alpha, - IndexType _beta, - cudaStream_t _stream = 0) - : number_of_vertices(_number_of_vertices), - number_of_edges(_number_of_edges), - row_offsets(_row_offsets), - col_indices(_col_indices), - directed(_directed), - alpha(_alpha), - beta(_beta), - stream(_stream) - { - setup(); - } - - void configure(IndexType* distances, - IndexType* predecessors, - double* sp_counters, - int* edge_mask); - - void traverse(IndexType source_vertex); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs_kernels.cuh b/cpp/src/traversal/legacy/bfs_kernels.cuh deleted file mode 100644 index a0c49e9601a..00000000000 --- a/cpp/src/traversal/legacy/bfs_kernels.cuh +++ /dev/null @@ -1,1163 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include -#include - -#include "traversal_common.cuh" -#include - -namespace cugraph { -namespace detail { -namespace bfs_kernels { -// -// ------------------------- Bottom up ------------------------- -// - -// -// fill_unvisited_queue_kernel -// -// Finding unvisited vertices in the visited_bmap, and putting them in the queue -// Vertices represented by the same int in the bitmap are adjacent in the queue, -// and sorted For instance, the queue can look like this : 34 38 45 58 61 4 18 -// 24 29 71 84 85 90 Because they are represented by those ints in the bitmap : -// [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - -// visited_bmap_nints = the visited_bmap is made of that number of ints - -template -__global__ void fill_unvisited_queue_kernel(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt) -{ - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // When filling the "unvisited" queue, we use "unvisited_cnt" to know where to - // write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - // We will actually do only one atomicAdd per block - we first do a scan, then - // call one atomicAdd, and store the common offset for the block in - // unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - // We don't want threads divergence in the loop (we're going to call - // __syncthreads) Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - // Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : (~0); // will be neutral in the next lines - // (virtual vertices all visited) - - // The last int can only be partially valid - // If we are indeed taking care of the last visited int in this thread, - // We need to first disable (ie set as "visited") the inactive bits - // (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = traversal::getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; // Setting inactive bits as visited - } - - // Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - // We will need to write n_unvisited_in_int unvisited vertices to the - // unvisited queue We ask for that space when computing the block scan, that - // will tell where to write those vertices in the queue, using the common - // offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - // Last thread knows how many vertices will be written to the queue by this - // block Asking for that space in the queue using the global count, and - // saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = traversal::atomicAdd(unvisited_cnt, total); - } - - // syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename traversal::vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec4* unvisited_i4 = - reinterpret_cast::vec4*>( - &unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename traversal::vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec2* unvisited_i2 = - reinterpret_cast::vec2*>( - &unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - } - } -} - -// Wrapper -template -void fill_unvisited_queue(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, 
block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(visited_bmap_nints) + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( - visited_bmap, visited_bmap_nints, n, unvisited, unvisited_cnt); - RAFT_CHECK_CUDA(m_stream); -} - -// -// count_unvisited_edges_kernel -// Couting the total number of unvisited edges in the graph - using an -// potentially unvisited queue We need the current unvisited vertices to be in -// the unvisited queue But visited vertices can be in the potentially_unvisited -// queue We first check if the vertex is still unvisited before using it Useful -// when switching from "Bottom up" to "Top down" -// - -template -__global__ void count_unvisited_edges_kernel(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* degree_vertices, - IndexType* mu) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - // number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) thread_unvisited_edges_count += degree_vertices[u]; - } - - // We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - // block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) traversal::atomicAdd(mu, block_unvisited_edges_count); -} - -// Wrapper -template -void count_unvisited_edges(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* node_degree, - IndexType* mu, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(potentially_unvisited_size) + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( - potentially_unvisited, potentially_unvisited_size, visited_bmap, node_degree, mu); - RAFT_CHECK_CUDA(m_stream); -} - -// -// Main Bottom Up kernel -// Here we will start to process unvisited vertices in the unvisited queue -// We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges -// If it's not possible to define a valid parent using only those edges, -// add it to the "left_unvisited_queue" -// - -// -// We will use the "vertices represented by the same int in the visited bmap are -// adjacents and sorted in the unvisited queue" property It is used to do a -// reduction locally and fully build the new visited_bmap -// - -template -__global__ void main_bottomup_kernel(const IndexType* unvisited, - const IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* left_unvisited_cnt, - int* visited_bmap, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - 
__shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // To write vertices in the frontier, - // We will use a block scan to locally compute the offsets - // frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the - // visited_bmap ints from the vertices represented by the same int (for - // instance vertices 1, 5, 9, 13, 23) vertices represented by the same int - // will be designed as part of the same "group" To detect the deliminations - // between those groups, we use BlockDiscontinuity Then we need to create the - // new "visited_bmap" within those group. We use a warp reduction that takes - // into account limits between groups to do it But a group can be cut in two - // different warps : in that case, the second warp put the result of its local - // reduction in local_visited_bmap_warp_head the first warp will then read it - // and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented - // by different in in visited_bmap) - IndexType visited_bmap_index[1]; // this is an array of size 1 because CUB - // needs one - - visited_bmap_index[0] = invalid_idx; - IndexType unvisited_vertex = invalid_idx; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) { - // Processing first STPV edges of unvisited v - // If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(static_cast(edge_end), - static_cast(edge_begin) + MAIN_BOTTOMUP_MAX_EDGES); - ++edge) { - if (edge_mask && !edge_mask[edge]) continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) { - 
found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); // let this one unvisited - else { - if (distances) distances[unvisited_vertex] = lvl; - if (predecessors) predecessors[unvisited_vertex] = valid_parent; - } - - // If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) { - left_unvisited_off = traversal::atomicAdd(left_unvisited_cnt, static_cast(1)); - more_to_visit = 1; - } - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in - // visited_bmap ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf - // fill_unvisited_queue) 2) A group is of size <= 32, so a warp will contain - // at least one head, and a group will be contained at most by two warps - - int is_head_a[1]; // CUB need an array - BlockDiscontinuity(discontinuity_temp_storage) - .FlagHeads(is_head_a, visited_bmap_index, cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the - // groups are We use bitwise and as operator, because of the fact that 1 is - // the default value If a vertex is unvisited, we have to explicitly ask for - // it - int local_bmap_agg = - WarpReduce(reduce_temp_storage) - .HeadSegmentedReduce(local_visited_bmap, is_head, traversal::BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part - // bellow Corner case : if the first thread of the warp is a head, then this - // group is not cut in two and then we have to be neutral (for an bitwise - // and, it's an ~0) - if (laneid == 0) { local_visited_bmap_warp_head[warpid] = (is_head) ? 
(~0) : local_bmap_agg; } - - // broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = __ballot_sync(raft::warp_full_mask(), is_head); - - // As long as idx < unvisited_size, we know there's at least one head per - // warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp - // delimitation) we also have to make sure that a warp actually exists after - // this one (this corner case is handled after) - if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - // Three cases : - // -> This is the first group of the block - it may be cut in two (with - // previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) { - // The first elt of this group considered in this block is - // unvisited_vertex We know that's the case because elts are sorted in a - // group, and we are at laneid == 0 We will do an atomicOr - we have to be - // neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = traversal::getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; // we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones - // to go in else case - idx < unvisited_size // we could be out - ) { - // Last head of the block - // We don't know if this group is complete - - // last_v is the last unvisited_vertex of the group IN THIS block - // we dont know about the rest - we have to be neutral about elts > last_v - - // the destination thread of the __shfl is active - int laneid_max = - min(static_cast(WARP_SIZE - 1), (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = __shfl_sync(__activemask(), unvisited_vertex, laneid_max, WARP_SIZE); - - if (is_last_head_in_warp) { - int ilast_v = last_v % INT_SIZE + 1; - int mask = traversal::getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; // we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } else { - // group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; // no atomics needed, we know - // everything about this int - } - } - - // Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) { - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // 1) Broadcasting frontier_common_block_offset - // 2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) left_unvisited[left_unvisited_off] = unvisited_vertex; - } -} - -template -void bottom_up_main(IndexType* unvisited, - 
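// Standalone CPU sketch of the work one bottom-up pass performs for each
// still-unvisited vertex: scan a bounded number of neighbours and adopt any
// neighbour sitting at distance lvl-1 as the parent. Vertices with more
// neighbours left to check are deferred to a second pass, mirroring the
// left_unvisited queue used by the kernels in this file. All names and types
// here are illustrative; this is not part of the deleted implementation.
#include <algorithm>
#include <cstdint>
#include <vector>

constexpr int32_t kMaxEdgesFirstPass = 6;  // same role as MAIN_BOTTOMUP_MAX_EDGES

inline void bottom_up_step_serial(const std::vector<int32_t>& row_ptr,
                                  const std::vector<int32_t>& col_ind,
                                  std::vector<int32_t>& distances,     // -1 == unvisited
                                  std::vector<int32_t>& predecessors,
                                  std::vector<int32_t>& deferred,      // vertices needing the 2nd pass
                                  int32_t lvl)
{
  const int32_t n = static_cast<int32_t>(row_ptr.size()) - 1;
  for (int32_t v = 0; v < n; ++v) {
    if (distances[v] != -1) continue;  // already visited
    const int32_t begin = row_ptr[v];
    const int32_t end   = row_ptr[v + 1];
    const int32_t stop  = std::min(end, begin + kMaxEdgesFirstPass);
    bool found          = false;
    for (int32_t e = begin; e < stop; ++e) {
      const int32_t u = col_ind[e];
      if (distances[u] == lvl - 1) {  // u is in the previous frontier: valid parent
        distances[v]    = lvl;
        predecessors[v] = u;
        found           = true;
        break;
      }
    }
    if (!found && end - begin > kMaxEdgesFirstPass) deferred.push_back(v);
  }
}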
IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* d_left_unvisited_idx, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(unvisited_size) + block.x) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// bottom_up_large_degree_kernel -// finishing the work started in main_bottomup_kernel for vertex with degree > -// MAIN_BOTTOMUP_MAX_EDGES && no parent found -// -template -__global__ void bottom_up_large_degree_kernel(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - // Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - // Used only with symmetric graphs - // Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; // we already have checked the - // first MAIN_BOTTOMUP_MAX_EDGES - // edges in find_unvisited - - IndexType end_i_edge = row_ptr[v + 1]; - - // We can have warp divergence in the next loop - // It's not a pb because the behaviour of __ballot - // is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - IndexType valid_parent = invalid_idx; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { valid_parent = u; } - } - - unsigned int warp_valid_p_ballot = - __ballot_sync(raft::warp_full_mask(), valid_parent != invalid_idx); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = - warp_valid_p_ballot >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - // Using only one valid parent (reduce bw) - IndexType off = traversal::atomicAdd(new_frontier_cnt, static_cast(1)); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] 
= lvl; - - if (predecessors) predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { break; } - } - } -} - -template -void bottom_up_large(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = std::min( - static_cast(MAXBLOCKS), - ((static_cast(left_unvisited_size) + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / - block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// topdown_expand_kernel -// Read current frontier and compute new one with top down paradigm -// One thread = One edge -// To know origin of edge, we have to find where is index_edge in the values of -// frontier_degrees_exclusive_sum (using a binary search, max less or equal -// than) This index k will give us the origin of this edge, which is frontier[k] -// This thread will then process the (linear_idx_thread - -// frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] -// -// To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load -// NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches -// We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we -// prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD -// * blockDim.x edges -// -// Once we have those offsets, we may still need a few values from -// frontier_degrees_exclusive_sum to compute exact index k To be able to do it, -// we will load the values that we need from frontier_degrees_exclusive_sum in -// shared memory We know that it will fit because we never add node with degree -// == 0 in the frontier, so we have an upper bound on the number of value to -// load (see below) -// -// We will then look which vertices are not visited yet : -// 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as -// visited, update distances and predecessors, and move on 2) if the unvisited -// vertex has degree > 0, we add it to the "frontier_candidates" queue -// -// We then treat the candidates queue using the threadIdx.x < ncandidates -// If we are indeed the first thread to discover that vertex (result of -// atomicOr(visited)) We add it to the new frontier -// - -template -__global__ void topdown_expand_kernel( - const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_bmap, - int* bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed) -{ - // BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType 
frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able - // to store everything We also save the predecessors here, because we will not - // be able to retrieve it after - // - __shared__ IndexType - shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType - shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - IndexType n_items_per_thread_left = - (totaldegree > block_offset) - ? (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) / TOP_DOWN_EXPAND_DIMX - : 0; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; (n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= min( - n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD))) { - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = - min(n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD)); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + - threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= - // linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE - // + 1] - // - // To find the exact value in that range, we need a few values from - // frontier_degrees_exclusive_sum (see below) We will load them here We will - // load as much as we can - if it doesn't fit we will make multiple - // iteration of the next loop Because all vertices in frontier have degree > - // 0, we know it will fits if left + 1 = right (see below) - - // We're going to load values in frontier_degrees_exclusive_sum for batch - // [left; right[ If it doesn't fit, --right until it does, then loop It is - // excepted to fit on the first try, that's why we start right = - // nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of - // buckets_offsets We need the next val for the binary search, hence the - // +1 - // - - IndexType 
nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - // If left = right + 1 we are sure to have nvalues_to_load < - // TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK]; - - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + - TOP_DOWN_EXPAND_DIMX]; - } - - // shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, - // sync - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; item_index < nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction - // parallism) Reduces latency - - IndexType current_max_edge_index = min( - static_cast(block_offset) + (left + nitems_per_thread_for_this_load) * blockDim.x, - static_cast(totaldegree)); - - // We will need vec_u (source of the edge) until the end if we need to - // save the predecessors For others informations, we will reuse pointers - // on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType* vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; - -#pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = - shared_buckets_offsets[start_off_idx] - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = traversal::binsearch_maxle( - shared_frontier_degrees_exclusive_sum, gid, bucket_start, bucket_end) + - frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = invalid_idx; - vec_frontier_degrees_exclusive_sum_index[iv] = invalid_idx; - } - } - - IndexType* vec_row_ptr_u = &local_buf1[0]; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - // row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != invalid_idx) ? 
row_ptr[u] : invalid_idx; - } - - // We won't need row_ptr after that, reusing pointer - IndexType* vec_dest_v = vec_row_ptr_u; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - - IndexType row_ptr_u = vec_row_ptr_u[iv]; - // Need this check so that we don't use invalid values of edge to index - if (row_ptr_u != invalid_idx) { - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) { - // Disabling edge - row_ptr_u = invalid_idx; - } else { - // Destination of this edge - vec_dest_v[iv] = col_ind[edge]; - } - } - } - - // We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType* vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; - - // Visited bmap need to contain information about the previous - // frontier if we actually process every edge (shortest path counting) - // otherwise we can read and update from the same bmap -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = - (v != invalid_idx) ? previous_bmap[v / INT_SIZE] : (~int(0)); // will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the - // new_frontier Then set vec_candidate[iv] = -1 - IndexType* vec_frontier_candidate = vec_dest_v; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - - int is_visited = vec_v_visited_bmap[iv] & m; - - if (is_visited) vec_frontier_candidate[iv] = invalid_idx; - } - - // Each source should update the destination shortest path counter - // if the destination has not been visited in the *previous* frontier - if (sp_counters) { -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType dst = vec_frontier_candidate[iv]; - if (dst != invalid_idx) { - IndexType src = vec_u[iv]; - atomicAdd(&sp_counters[dst], sp_counters[src]); - } - } - } - - if (directed) { - // vec_v_visited_bmap is available - IndexType* vec_is_isolated_bmap = vec_v_visited_bmap; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != invalid_idx) ? isolated_bmap[v / INT_SIZE] : ~int(0); - } - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; - - // If v is isolated, we will not add it to the frontier (it's not a - // frontier candidate) 1st reason : it's useless 2nd reason : it - // will make top down algo fail we need each node in frontier to - // have a degree > 0 If it is isolated, we just need to mark it as - // visited, and save distance and predecessor here. 
Not need to - // check return value of atomicOr - - if (is_isolated && v != invalid_idx) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) distances[v] = lvl; - - if (predecessors) predecessors[v] = vec_u[iv]; - - // This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = invalid_idx; - } - } - } - - // Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != invalid_idx) ++thread_n_frontier_candidates; - } - - // We need to have all nfrontier_candidates to be ready before doing the - // scan - __syncthreads(); - - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; // offset inside block - BlockScan(scan_storage) - .ExclusiveSum(thread_n_frontier_candidates, thread_frontier_candidate_offset); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - // May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != invalid_idx) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - // No need to add nsuccessor_candidate, even if its an - // exclusive sum - // We incremented the thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - // broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - // We won't need vec_frontier_candidate after that - IndexType* vec_frontier_accepted_vertex = vec_frontier_candidate; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = invalid_idx; - - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; // popping - // queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); // atomicOr returns old - - if (!(m & q)) { // if this thread was the first to discover this node - if (distances) distances[v] = lvl; - - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } - - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } - } - - // We need naccepted_vertices to be ready - __syncthreads(); - - IndexType thread_new_frontier_offset; - - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - // for this thread, thread_new_frontier_offset + has_successor - // (exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // Broadcasting frontier_common_block_offset - __syncthreads(); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != invalid_idx) { - IndexType off = 
frontier_common_block_offset + thread_new_frontier_offset++; - new_frontier[off] = new_frontier_vertex; - } - } - } - } - - // We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - // Preparing for next load - left = right; - right = nitems_per_thread; - } - - // we need to keep shared_buckets_offsets coherent - __syncthreads(); - } -} - -template -void frontier_expand(const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_visited_bmap, - int* visited_bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) -{ - if (!totaldegree) return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = - (static_cast(totaldegree) + MAXBLOCKS * block.x - 1) / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = std::min((static_cast(totaldegree) + max_items_per_thread * block.x - 1) / - (max_items_per_thread * block.x), - static_cast(MAXBLOCKS)); - - // Shortest Path counting (Betweenness Centrality) - // We need to keep track of the previously visited bmap - - // If the coutner of shortest path is nullptr - // The previous_visited_bmap is no longer needed (and should be nullptr on - // the first access), so it can be the same as the current visitedbmap - if (!sp_counters) { previous_visited_bmap = visited_bmap; } - topdown_expand_kernel<<>>( - row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - previous_visited_bmap, - visited_bmap, - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap, - directed); - RAFT_CHECK_CUDA(m_stream); -} - -} // namespace bfs_kernels -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/traversal_common.cuh b/cpp/src/traversal/legacy/traversal_common.cuh deleted file mode 100644 index fac80e90eb6..00000000000 --- a/cpp/src/traversal/legacy/traversal_common.cuh +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include - -#define MAXBLOCKS 65535 -#define WARP_SIZE 32 -#define INT_SIZE 32 - -// -// Bottom up macros -// - -#define FILL_UNVISITED_QUEUE_DIMX 256 - -#define COUNT_UNVISITED_EDGES_DIMX 256 - -#define MAIN_BOTTOMUP_DIMX 256 -#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX / WARP_SIZE) - -#define LARGE_BOTTOMUP_DIMX 256 - -// Number of edges processed in the main bottom up kernel -#define MAIN_BOTTOMUP_MAX_EDGES 6 - -// Power of 2 < 32 (strict <) -#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 - -// -// Top down macros -// - -// We will precompute the results the binsearch_maxle every -// TOP_DOWN_BUCKET_SIZE edges -#define TOP_DOWN_BUCKET_SIZE 32 - -// DimX of the kernel -#define TOP_DOWN_EXPAND_DIMX 256 - -// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets -#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX / TOP_DOWN_BUCKET_SIZE) - -// How many items_per_thread we can process with one bucket_offset loading -// the -1 is here because we need the +1 offset -#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) - -// instruction parallelism -// for how many edges will we create instruction parallelism -#define TOP_DOWN_BATCH_SIZE 2 - -#define COMPUTE_BUCKET_OFFSETS_DIMX 512 - -// Other macros - -#define FLAG_ISOLATED_VERTICES_DIMX 128 - -// Number of vertices handled by one thread -// Must be power of 2, lower than 32 -#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 - -// Number of threads involved in the "construction" of one int in the bitset -#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT \ - (INT_SIZE / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) - -// -// Parameters of the heuristic to switch between bottomup/topdown -// Finite machine described in -// http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf -// - -namespace cugraph { -namespace detail { -namespace traversal { - -// -// gives the equivalent vectors from a type -// for the max val, would be better to use numeric_limits<>::max() once -// cpp11 is allowed in nvgraph -// - -template -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; -}; - -template <> -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef long4 vec4; - typedef long2 vec2; - static const long max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef uint4 vec4; - typedef uint2 vec2; - static const unsigned max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef float4 vec4; - typedef float2 vec2; - static constexpr float max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef double4 vec4; - typedef double2 vec2; - static constexpr double max = std::numeric_limits::max(); -}; - -// -// ------------------------- Helper device functions ------------------- -// - -__forceinline__ __device__ int getMaskNRightmostBitSet(int n) -{ - if (n == INT_SIZE) return (~0); - int mask = (1 << n) - 1; - return mask; -} - -__forceinline__ __device__ int getMaskNLeftmostBitSet(int n) -{ - if (n == 0) return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; -} - -__forceinline__ __device__ int getNextZeroBit(int& val) -{ - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; -} - -struct BitwiseAnd { - template - __host__ __device__ __forceinline__ T 
operator()(const T& a, const T& b) const - { - return (a & b); - } -}; -struct BitwiseOr { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const - { - return (a | b); - } -}; - -template -__global__ void fill_vec_kernel(ValueType* vec, SizeType n, ValueType val) -{ - for (SizeType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) - vec[idx] = val; -} - -template -void fill_vec(ValueType* vec, SizeType n, ValueType val, cudaStream_t stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - RAFT_CHECK_CUDA(stream); -} - -template -__device__ IndexType -binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexType high) -{ - while (true) { - if (low == high) return low; // we know it exists - if ((low + 1) == high) return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - } -} - -// FIXME: The atomicAdd wrappers should be moved to RAFT - -template -__device__ static __forceinline__ T atomicAdd(T* addr, T val) -{ - return ::atomicAdd(addr, val); -} - -template <> -__device__ __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) -{ - static_assert(sizeof(int64_t) == sizeof(unsigned long long), - "sizeof(int64_t) != sizeof(unsigned long long). Can't use atomicAdd"); - - return ::atomicAdd(reinterpret_cast(addr), - static_cast(val)); -} - -__device__ static __forceinline__ float atomicMin(float* addr, float val) -{ - int* addr_as_int = (int*)addr; - int old = *addr_as_int; - int expected; - do { - expected = old; - old = - ::atomicCAS(addr_as_int, expected, __float_as_int(::fminf(val, __int_as_float(expected)))); - } while (expected != old); - return __int_as_float(old); -} - -__device__ static __forceinline__ double atomicMin(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = ::atomicCAS( - address_as_ull, assumed, __double_as_longlong(::fmin(val, __longlong_as_double(assumed)))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != - // NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} - -template -__global__ void flag_isolated_vertices_kernel(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated) -{ - typedef cub::BlockLoad - BlockLoad; - typedef cub::BlockStore - BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage - warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - IndexType thread_off = block_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType 
thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load(row_ptr + block_off, thread_row_ptr, block_valid_items, -1); - - // To compute 4 degrees, we need 5 values of row_ptr - // Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; } - - // If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n > thread_off) ? (n - thread_off) : 0; - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - // We need local_nisolated and local_isolated_bmap to be ready for next - // steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { traversal::atomicAdd(nisolated, total_nisolated); } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - // Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(local_isolated_bmap, BitwiseOr()); - - int is_head_of_visited_int = ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int && (thread_off / INT_SIZE) < (n + INT_SIZE - 1) / INT_SIZE) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items - 1); - } -} - -template -void flag_isolated_vertices(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min((IndexType)MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>( - n, isolated_bmap, row_ptr, degrees, nisolated); - RAFT_CHECK_CUDA(m_stream); -} - -template -__global__ void set_frontier_degree_kernel(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n) -{ - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } -} - -template -void set_frontier_degree(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType)MAXBLOCKS); - 
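// The frontier degrees gathered here, their exclusive sum, and binsearch_maxle()
// above together give the "one thread = one edge" mapping used by
// topdown_expand_kernel: an edge's global index is mapped back to the frontier
// vertex that owns it. A small host-side illustration is sketched below; the
// names are illustrative and only the indexing scheme mirrors this header.
#include <cstdint>
#include <vector>

// Largest k with exclusive_sum[k] <= value (same contract as binsearch_maxle).
inline int64_t max_le(const std::vector<int64_t>& exclusive_sum, int64_t value)
{
  int64_t low = 0, high = static_cast<int64_t>(exclusive_sum.size()) - 1;
  while (low < high) {
    const int64_t mid = low + (high - low + 1) / 2;  // biased up so the loop terminates
    if (exclusive_sum[mid] <= value) low = mid; else high = mid - 1;
  }
  return low;
}

// Walk every edge incident to the frontier and recover (source vertex, local edge).
inline void for_each_frontier_edge(const std::vector<int64_t>& frontier,
                                   const std::vector<int64_t>& degree_exclusive_sum,  // one entry per frontier vertex
                                   const std::vector<int64_t>& row_ptr,
                                   const std::vector<int64_t>& col_ind,
                                   int64_t total_degree)
{
  for (int64_t gid = 0; gid < total_degree; ++gid) {
    const int64_t k    = max_le(degree_exclusive_sum, gid);          // which frontier entry owns this edge
    const int64_t src  = frontier[k];
    const int64_t edge = row_ptr[src] + (gid - degree_exclusive_sum[k]);
    const int64_t dst  = col_ind[edge];
    (void)dst;  // a real traversal would now test/update the visited bitmap for dst
  }
}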
set_frontier_degree_kernel<<>>(frontier_degree, frontier, degree, n); - RAFT_CHECK_CUDA(m_stream); -} - -template -void exclusive_sum(void* d_temp_storage, - size_t temp_storage_bytes, - IndexType* d_in, - IndexType* d_out, - IndexType num_items, - cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, m_stream); -} - -template -void exclusive_sum(IndexType* d_in, IndexType* d_out, IndexType num_items, cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - thrust::exclusive_scan(rmm::exec_policy(m_stream), d_in, d_in + num_items, d_out); -} - -// -// compute_bucket_offsets_kernel -// simply compute the position in the frontier corresponding all valid edges -// with index=TOP_DOWN_BUCKET_SIZE * k, k integer -// - -template -__global__ void compute_bucket_offsets_kernel(const IndexType* frontier_degrees_exclusive_sum, - IndexType* bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) -{ - IndexType end = - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; bid <= end; - bid += gridDim.x * blockDim.x) { - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = - binsearch_maxle(frontier_degrees_exclusive_sum, eid, (IndexType)0, frontier_size - 1); - } -} - -template -void compute_bucket_offsets(IndexType* cumul, - IndexType* bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = - min((IndexType)MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1 + - block.x - 1) / - block.x); - - compute_bucket_offsets_kernel<<>>( - cumul, bucket_offsets, frontier_size, total_degree); - RAFT_CHECK_CUDA(m_stream); -} -} // namespace traversal -} // namespace detail -} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index da1e0e50919..eebd31a0030 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -239,15 +239,6 @@ ConfigureTest(GRAPH_GENERATORS_TEST generators/generators_test.cpp) # - erdos renyi graph generator tests ------------------------------------------------------------- ConfigureTest(ERDOS_RENYI_GENERATOR_TEST generators/erdos_renyi_test.cpp) -################################################################################################### -# - betweenness centrality tests ------------------------------------------------------------------ -ConfigureTest(LEGACY_BETWEENNESS_TEST centrality/legacy/betweenness_centrality_test.cu) -ConfigureTest(LEGACY_EDGE_BETWEENNESS_TEST centrality/legacy/edge_betweenness_centrality_test.cu) - -################################################################################################### -# - BFS tests ------------------------------------------------------------------------------------- -ConfigureTest(LEGACY_BFS_TEST traversal/legacy/bfs_test.cu) - ################################################################################################### # - LOUVAIN tests --------------------------------------------------------------------------------- ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp) diff --git a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu 
b/cpp/tests/centrality/legacy/betweenness_centrality_test.cu deleted file mode 100644 index 2cdbe1c98e4..00000000000 --- a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -void ref_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -template -void ref_endpoints_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - result[source] += S.size() - 1; - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w] + 1; } - } -} - -template -void ref_edge_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - bool endpoints, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector 
sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - bool endpoints, - vertex_t const number_of_vertices, - vertex_t const number_of_sources) -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - result_t casted_number_of_sources = static_cast(number_of_sources); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 2) { - if (endpoints) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - } else { - if (!directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - if (modified) { - if (number_of_sources > 0) { - rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); - } - } - for (auto idx = 0; idx < number_of_vertices; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, // This is not yet implemented - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_betweenness_centrality_impl(&h_indices[0], - &h_offsets[0], - number_of_vertices, - result, - endpoints, - sources, - number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, endpoints, number_of_vertices, number_of_sources); -} -// Explicit instantiation -/* FIXME!!! 
-template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - float *, - bool, - bool, - const int, - int const *); -template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - double *, - bool, - bool, - const int, - int const *); -*/ - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct BC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - BC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BC_Usecase; - -class Tests_BC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BC() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - // endpoints should the endpoints be included - template - void run_current_test(const BC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_vertices, 0); - std::vector expected(G.number_of_vertices, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_betweenness_centrality( - G, 
expected.data(), normalize, endpoints, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_vertices); - cugraph::betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - endpoints, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - cudaDeviceSynchronize(); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_vertices, - cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - for (int i = 0; i < G.number_of_vertices; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4), - BC_Usecase("test/datasets/wiki2003.mtx", 4), - BC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu b/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu deleted file mode 100644 index 153993deda7..00000000000 --- a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ - -template -edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex, - vertex_t destination_vertex, - vertex_t const* indices, - edge_t const* offsets) -{ - edge_t index = -1; - edge_t first_edge_idx = offsets[source_vertex]; - edge_t last_edge_idx = offsets[source_vertex + 1]; - auto index_it = std::find(indices + first_edge_idx, indices + last_edge_idx, destination_vertex); - if (index_it != (indices + last_edge_idx)) { index = std::distance(indices, index_it); } - return index; -} - -template -void ref_accumulation(result_t* result, - vertex_t const* indices, - edge_t const* offsets, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - edge_t edge_idx = - get_edge_index_from_source_and_destination( - v, w, indices, offsets); - double coefficient = (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - - deltas[v] += coefficient; - result[edge_idx] += coefficient; - } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_edge_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. 
Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - vertex_t const number_of_vertices, - edge_t const number_of_edges) -{ - result_t rescale_factor = static_cast(1); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 1) { - rescale_factor /= ((casted_number_of_vertices) * (casted_number_of_vertices - 1)); - } - } else { - if (!directed) { rescale_factor /= static_cast(2); } - } - for (auto idx = 0; idx < number_of_edges; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_edge_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_edge_betweenness_centrality_impl( - &h_indices[0], &h_offsets[0], number_of_vertices, result, sources, number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, number_of_vertices, number_of_edges); -} - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct EdgeBC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - EdgeBC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} EdgeBC_Usecase; - -class Tests_EdgeBC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_EdgeBC() {} - - 
static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // FIXME: Should normalize be part of the configuration instead? - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - template - void run_current_test(const EdgeBC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_edges, 0); - std::vector expected(G.number_of_edges, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_edge_betweenness_centrality( - G, expected.data(), normalize, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_edges); - cugraph::edge_betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_edges, - cudaMemcpyDeviceToHost)); - for (int i = 0; i < G.number_of_edges; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_EdgeBC, CheckFP64_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NORMALIZE) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_EdgeBC, CheckFP64_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - 
::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/legacy/bfs_ref.h b/cpp/tests/traversal/legacy/bfs_ref.h deleted file mode 100644 index 5efdce818e7..00000000000 --- a/cpp/tests/traversal/legacy/bfs_ref.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -template -void populate_neighbors(VT* indices, ET* offsets, VT w, std::vector& neighbors) -{ - ET edge_start = offsets[w]; - ET edge_end = offsets[w + 1]; - - neighbors.assign(indices + edge_start, indices + edge_end); -} - -// This implements the BFS based on (Brandes, 2001) for shortest path counting -template -void ref_bfs(VT* indices, - ET* offsets, - VT const number_of_vertices, - std::queue& Q, - std::stack& S, - std::vector& dist, - std::vector>& pred, - std::vector& sigmas, - VT source) -{ - std::vector neighbors; - pred.clear(); - pred.resize(number_of_vertices); - dist.assign(number_of_vertices, std::numeric_limits::max()); - sigmas.assign(number_of_vertices, 0); - dist[source] = 0; - sigmas[source] = 1; - Q.push(source); - // b. Traversal - while (!Q.empty()) { - VT v = Q.front(); - Q.pop(); - S.push(v); - populate_neighbors(indices, offsets, v, neighbors); - for (VT w : neighbors) { - // Path Discovery: - // Found for the first time? - if (dist[w] == std::numeric_limits::max()) { - dist[w] = dist[v] + 1; - Q.push(w); - } - // Path counting - // Edge(v, w) on a shortest path? - if (dist[w] == dist[v] + 1) { - sigmas[w] += sigmas[v]; - pred[w].push_back(v); - } - } - } -} diff --git a/cpp/tests/traversal/legacy/bfs_test.cu b/cpp/tests/traversal/legacy/bfs_test.cu deleted file mode 100644 index c6d3c96aa93..00000000000 --- a/cpp/tests/traversal/legacy/bfs_test.cu +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. 
- */ - -#include "bfs_ref.h" - -#include -#include - -#include - -#include -#include - -#include - -#include -#include -#include - -// NOTE: This could be common to other files but we might not want the same precision -// depending on the algorithm -#ifndef TEST_EPSILON // It is currently use for relative error -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================ -// Test Suite -// ============================================================================ -typedef struct BFS_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int source_; // Starting point from the traversal - BFS_Usecase_t(const std::string& config, int source) : config_(config), source_(source) - { - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BFS_Usecase; - -class Tests_BFS : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BFS() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // VT vertex identifier data type - // ET edge identifier data type - // WT edge weight data type - // return_sp_counter should BFS return shortest path countner - template - void run_current_test(const BFS_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - VT number_of_vertices; - ET number_of_edges; - bool directed = false; - auto csr = - cugraph::test::generate_graph_csr_from_mm(directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = directed; - - ASSERT_TRUE(configuration.source_ >= 0 && (VT)configuration.source_ < G.number_of_vertices) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph"; - - VT source = configuration.source_; - - number_of_vertices = G.number_of_vertices; - number_of_edges = G.number_of_edges; - - std::vector indices(number_of_edges); - std::vector offsets(number_of_vertices + 1); - - RAFT_CUDA_TRY( - cudaMemcpy(indices.data(), G.indices, sizeof(VT) * indices.size(), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(offsets.data(), G.offsets, sizeof(ET) * offsets.size(), cudaMemcpyDeviceToHost)); - - std::queue Q; - std::stack S; - std::vector ref_bfs_dist(number_of_vertices); - std::vector> ref_bfs_pred(number_of_vertices); - std::vector ref_bfs_sigmas(number_of_vertices); - - ref_bfs(indices.data(), - offsets.data(), - number_of_vertices, - Q, - S, - ref_bfs_dist, - ref_bfs_pred, - ref_bfs_sigmas, - source); - - 
// Device data for cugraph_bfs - rmm::device_vector d_cugraph_dist(number_of_vertices); - rmm::device_vector d_cugraph_pred(number_of_vertices); - rmm::device_vector d_cugraph_sigmas(number_of_vertices); - - std::vector cugraph_dist(number_of_vertices); - std::vector cugraph_pred(number_of_vertices); - std::vector cugraph_sigmas(number_of_vertices); - - // Don't pass valid sp_sp_counter ptr unless needed because it disables - // the bottom up flow - cugraph::bfs(handle, - G, - d_cugraph_dist.data().get(), - d_cugraph_pred.data().get(), - (return_sp_counter) ? d_cugraph_sigmas.data().get() : nullptr, - source, - G.prop.directed); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_dist.data(), - d_cugraph_dist.data().get(), - sizeof(VT) * d_cugraph_dist.size(), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_pred.data(), - d_cugraph_pred.data().get(), - sizeof(VT) * d_cugraph_pred.size(), - cudaMemcpyDeviceToHost)); - - if (return_sp_counter) { - RAFT_CUDA_TRY(cudaMemcpy(cugraph_sigmas.data(), - d_cugraph_sigmas.data().get(), - sizeof(double) * d_cugraph_sigmas.size(), - cudaMemcpyDeviceToHost)); - } - - for (VT i = 0; i < number_of_vertices; ++i) { - // Check distances: should be an exact match as we use signed int 32-bit - EXPECT_EQ(cugraph_dist[i], ref_bfs_dist[i]) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - // Check predecessor: We do not enforce the predecessor, we simply verifiy - // that the predecessor obtained with the GPU implementation is one of the - // predecessors obtained during the C++ BFS traversal - VT pred = cugraph_pred[i]; // It could be equal to -1 if the node is never reached - constexpr VT invalid_vid = cugraph::legacy::invalid_vertex_id::value; - if (pred == invalid_vid) { - EXPECT_TRUE(ref_bfs_pred[i].empty()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph had not predecessor," - << "while c++ ref found at least one."; - } else { - // This can get expensive to check, we could have simply verified that based - // on the the distance from the source to the predecessor, but this ensures that there - // are no misassignations - auto it = std::find(ref_bfs_pred[i].begin(), ref_bfs_pred[i].end(), pred); - EXPECT_TRUE(it != ref_bfs_pred[i].end()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph = " << cugraph_sigmas[i] - << " , c++ ref did not consider it as a predecessor."; - } - - if (return_sp_counter) { - EXPECT_TRUE( - compare_close(cugraph_sigmas[i], ref_bfs_sigmas[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - } - } - } -}; - -// ============================================================================ -// Tests -// ============================================================================ - -// We don't need to test WT for both float and double since it's anyway ignored in BFS -TEST_P(Tests_BFS, CheckUint32_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_NO_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BFS, CheckUint32_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BFS, - ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), - 
BFS_Usecase("test/datasets/polbooks.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 100), - BFS_Usecase("test/datasets/wiki2003.mtx", 1000), - BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); - -CUGRAPH_TEST_PROGRAM_MAIN()