diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 87d26bfd848..63a91d4971f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -195,7 +195,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/traversal/legacy/bfs.cu src/link_prediction/legacy/jaccard.cu src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu @@ -234,7 +233,6 @@ set(CUGRAPH_SOURCES src/cores/k_core_sg.cu src/cores/k_core_mg.cu src/components/legacy/connectivity.cu - src/centrality/legacy/betweenness_centrality.cu src/generators/generate_rmat_edgelist.cu src/generators/generate_bipartite_rmat_edgelist.cu src/generators/generator_tools.cu diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cu b/cpp/src/centrality/legacy/betweenness_centrality.cu deleted file mode 100644 index cd274a408e1..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cu +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include "betweenness_centrality.cuh" -#include "betweenness_centrality_kernels.cuh" -#include - -namespace cugraph { -namespace detail { -namespace { -template -void betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t total_number_of_sources) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - bc.rescale_by_total_sources_used(total_number_of_sources); -} - -template -void edge_betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t /* total_number_of_sources */) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = true; - bool endpoints = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - // NOTE: As of 07/2020 
NetworkX does not apply rescaling based on number - // of sources - // bc.rescale_by_total_sources_used(total_number_of_sources); -} -template -vertex_t get_total_number_of_sources(raft::handle_t const& handle, vertex_t local_number_of_sources) -{ - vertex_t total_number_of_sources_used = local_number_of_sources; - if (handle.comms_initialized()) { - rmm::device_scalar d_number_of_sources(local_number_of_sources, handle.get_stream()); - handle.get_comms().allreduce(d_number_of_sources.data(), - d_number_of_sources.data(), - 1, - raft::comms::op_t::SUM, - handle.get_stream()); - total_number_of_sources_used = d_number_of_sources.value(handle.get_stream()); - // RAFT_CUDA_TRY( - // cudaMemcpy(&total_number_of_sources_used, data, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - } - return total_number_of_sources_used; -} -} // namespace - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - static_assert(std::is_same::value, "vertex_t should be int"); - static_assert(std::is_same::value, "edge_t should be int"); - static_assert(std::is_same::value || std::is_same::value, - "weight_t should be float or double"); - static_assert(std::is_same::value || std::is_same::value, - "result_t should be float or double"); - - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: betwenness pointer is NULL"); - CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be positive or equal to 0."); - if (number_of_sources != 0) { - CUGRAPH_EXPECTS(sources != nullptr, - "Sources cannot be NULL if number_of_source is different from 0."); - } - if (is_edge_betweenness) { - CUGRAPH_EXPECTS(!endpoints, "Endpoints is not supported for edge betweenness centrality."); - } -} - -template -void BC::setup() -{ - number_of_vertices_ = graph_.number_of_vertices; - number_of_edges_ = graph_.number_of_edges; - offsets_ptr_ = graph_.offsets; - indices_ptr_ = graph_.indices; -} - -template -void BC::configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalized, - bool endpoints, - weight_t const* weights, - vertex_t const* sources, - vertex_t number_of_sources) -{ - // --- Bind betweenness output vector to internal --- - betweenness_ = betweenness; - normalized_ = normalized; - endpoints_ = endpoints; - sources_ = sources; - number_of_sources_ = number_of_sources; - edge_weights_ptr_ = weights; - is_edge_betweenness_ = is_edge_betweenness; - - // --- Working data allocation --- - initialize_work_vectors(); - initialize_pointers_to_vectors(); - - // --- Get Device Information --- - initialize_device_information(); - - // --- Confirm that configuration went through --- - configured_ = true; -} - -template -void BC::initialize_work_vectors() -{ - distances_vec_.resize(number_of_vertices_); - predecessors_vec_.resize(number_of_vertices_); - sp_counters_vec_.resize(number_of_vertices_); - deltas_vec_.resize(number_of_vertices_); -} - -template -void BC::initialize_pointers_to_vectors() -{ - distances_ = distances_vec_.data().get(); - predecessors_ = predecessors_vec_.data().get(); - sp_counters_ = sp_counters_vec_.data().get(); - deltas_ = deltas_vec_.data().get(); -} - -template -void BC::initialize_device_information() -{ - max_grid_dim_1D_ = handle_.get_device_properties().maxGridSize[0]; - max_block_dim_1D_ = handle_.get_device_properties().maxThreadsDim[0]; -} - -template -void BC::compute() -{ - CUGRAPH_EXPECTS(configured_, 
"BC must be configured before computation"); - if (sources_) { - for (vertex_t source_idx = 0; source_idx < number_of_sources_; ++source_idx) { - vertex_t source_vertex = sources_[source_idx]; - compute_single_source(source_vertex); - } - } else { - for (vertex_t source_vertex = 0; source_vertex < number_of_vertices_; ++source_vertex) { - compute_single_source(source_vertex); - } - } - rescale(); -} - -template -void BC::compute_single_source(vertex_t source_vertex) -{ - // Step 1) Singe-source shortest-path problem - cugraph::bfs(handle_, - graph_, - distances_, - predecessors_, - sp_counters_, - source_vertex, - graph_.prop.directed, - true); - - // FIXME: Remove that with a BC specific class to gather - // information during traversal - - // Numeric max value is replaced by -1 as we look for the maximal depth of - // the traversal, this value is avalaible within the bfs implementation and - // there could be a way to access it directly and avoid both replace and the - // max - thrust::replace(handle_.get_thrust_policy(), - distances_, - distances_ + number_of_vertices_, - std::numeric_limits::max(), - static_cast(-1)); - auto current_max_depth = - thrust::max_element(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_); - vertex_t max_depth = 0; - RAFT_CUDA_TRY( - cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - // Step 2) Dependency accumulation - accumulate(source_vertex, max_depth); -} - -template -void BC::accumulate(vertex_t source_vertex, - vertex_t max_depth) -{ - dim3 grid_configuration, block_configuration; - block_configuration.x = max_block_dim_1D_; - grid_configuration.x = min(max_grid_dim_1D_, (number_of_edges_ / block_configuration.x + 1)); - - initialize_dependencies(); - - if (is_edge_betweenness_) { - accumulate_edges(max_depth, grid_configuration, block_configuration); - } else if (endpoints_) { - accumulate_vertices_with_endpoints( - source_vertex, max_depth, grid_configuration, block_configuration); - } else { - accumulate_vertices(max_depth, grid_configuration, block_configuration); - } -} - -template -void BC::initialize_dependencies() -{ - thrust::fill( - handle_.get_thrust_policy(), deltas_, deltas_ + number_of_vertices_, static_cast(0)); -} -template -void BC::accumulate_edges(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth >= 0; --depth) { - edges_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } -} - -template -void BC::accumulate_vertices_with_endpoints( - vertex_t source_vertex, vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - endpoints_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_reached_endpoints_to_source_betweenness(source_vertex); - add_vertices_dependencies_to_betweenness(); -} - -// Distances should contain -1 for unreached nodes, - -// FIXME: There might be a cleaner way to add a value to a single -// score in the betweenness vector -template -void BC::add_reached_endpoints_to_source_betweenness( - vertex_t source_vertex) -{ - vertex_t number_of_unvisited_vertices = - thrust::count(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_, -1); - vertex_t number_of_visited_vertices_except_source = - 
number_of_vertices_ - number_of_unvisited_vertices - 1; - rmm::device_vector buffer(1); - buffer[0] = number_of_visited_vertices_except_source; - thrust::transform(handle_.get_thrust_policy(), - buffer.begin(), - buffer.end(), - betweenness_ + source_vertex, - betweenness_ + source_vertex, - thrust::plus()); -} - -template -void BC::add_vertices_dependencies_to_betweenness() -{ - thrust::transform(handle_.get_thrust_policy(), - deltas_, - deltas_ + number_of_vertices_, - betweenness_, - betweenness_, - thrust::plus()); -} - -template -void BC::accumulate_vertices(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_vertices_dependencies_to_betweenness(); -} - -template -void BC::rescale() -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - if (normalized_) { - if (is_edge_betweenness_) { - std::tie(rescale_factor, modified) = - rescale_edges_betweenness_centrality(rescale_factor, modified); - } else { - std::tie(rescale_factor, modified) = - rescale_vertices_betweenness_centrality(rescale_factor, modified); - } - } else { - if (!graph_.prop.directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} - -template -std::tuple -BC::rescale_edges_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices_ = static_cast(number_of_vertices_); - if (number_of_vertices_ > 1) { - rescale_factor /= ((casted_number_of_vertices_) * (casted_number_of_vertices_ - 1)); - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -std::tuple -BC::rescale_vertices_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - if (number_of_vertices_ > 2) { - if (endpoints_) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -void BC::apply_rescale_factor_to_betweenness( - result_t rescale_factor) -{ - size_t result_size = number_of_vertices_; - if (is_edge_betweenness_) result_size = number_of_edges_; - thrust::transform(handle_.get_thrust_policy(), - betweenness_, - betweenness_ + result_size, - thrust::make_constant_iterator(rescale_factor), - betweenness_, - thrust::multiplies()); -} - -template -void BC::rescale_by_total_sources_used( - vertex_t total_number_of_sources_used) -{ - result_t rescale_factor = static_cast(1); - result_t casted_total_number_of_sources_used = - static_cast(total_number_of_sources_used); - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - - if (normalized_) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } else if (!graph_.prop.directed) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} -} // namespace detail - -template -void 
betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_vertices, 0); - detail::betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::betweenness_centrality_impl(handle, - graph, - result, - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - } -} - -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - bool, - float const*, - int, - int const*); -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - double*, - bool, - bool, - double const*, - int, - int const*); - -template -void edge_betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_edges, 0); - detail::edge_betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::edge_betweenness_centrality_impl( - handle, graph, result, normalize, weight, k, vertices, total_number_of_sources_used); - } -} - -template void edge_betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - float const*, - int, - int const*); - -template void edge_betweenness_centrality( - raft::handle_t const& handle, - legacy::GraphCSRView const&, - double*, - bool, - double const*, - int, - int const*); -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cuh b/cpp/src/centrality/legacy/betweenness_centrality.cuh deleted file mode 100644 index 43f095d634f..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cuh +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Xavier Cadet xcadet@nvidia.com - -#pragma once -#include - -namespace cugraph { -namespace detail { -template -void betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void edge_betweenness_centrality(legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -class BC { - public: - virtual ~BC(void) {} - BC(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - cudaStream_t stream = 0) - : handle_(handle), graph_(graph) - { - setup(); - } - void configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - - void configure_edge(result_t* betweenness, - bool normalize, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - void compute(); - void rescale_by_total_sources_used(vertex_t total_number_of_sources_used); - - private: - // --- RAFT handle --- - raft::handle_t const& handle_; - // --- Information concerning the graph --- - const legacy::GraphCSRView& graph_; - // --- These information are extracted on setup --- - vertex_t number_of_vertices_; // Number of vertices in the graph - vertex_t number_of_edges_; // Number of edges in the graph - edge_t const* offsets_ptr_; // Pointer to the offsets - vertex_t const* indices_ptr_; // Pointers to the indices - - // --- Information from configuration --- - bool configured_ = false; // Flag to ensure configuration was called - bool normalized_ = false; // If True normalize the betweenness - bool is_edge_betweenness_ = false; // If True compute edge_betweeness - - // FIXME: For weighted version - weight_t const* edge_weights_ptr_ = nullptr; // Pointer to the weights - bool endpoints_ = false; // If True normalize the betweenness - vertex_t const* sources_ = nullptr; // Subset of vertices to gather information from - vertex_t number_of_sources_; // Number of vertices in sources - - // --- Output ---- - // betweenness is set/read by users - using Vectors - result_t* betweenness_ = nullptr; - - // --- Data required to perform computation ---- - rmm::device_vector distances_vec_; - rmm::device_vector predecessors_vec_; - rmm::device_vector sp_counters_vec_; - rmm::device_vector deltas_vec_; - - vertex_t* distances_ = - nullptr; // array(|V|) stores the distances gathered by the latest SSSP - vertex_t* predecessors_ = - nullptr; // array(|V|) stores the predecessors of the latest SSSP - double* sp_counters_ = - nullptr; // array(|V|) stores the shortest path counter for the latest SSSP - double* deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP - - int max_grid_dim_1D_ = 0; - int max_block_dim_1D_ = 0; - - void setup(); - - void initialize_work_vectors(); - void initialize_pointers_to_vectors(); - void initialize_device_information(); - - void compute_single_source(vertex_t source_vertex); - - void accumulate(vertex_t source_vertex, vertex_t max_depth); - void initialize_dependencies(); - void 
accumulate_edges(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void accumulate_vertices_with_endpoints(vertex_t source_vertex, - vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration); - void accumulate_vertices(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void add_reached_endpoints_to_source_betweenness(vertex_t source_vertex); - void add_vertices_dependencies_to_betweenness(); - - void rescale(); - std::tuple rescale_vertices_betweenness_centrality(result_t rescale_factor, - bool modified); - std::tuple rescale_edges_betweenness_centrality(result_t rescale_factor, - bool modified); - void apply_rescale_factor_to_betweenness(result_t scaling_factor); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh b/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh deleted file mode 100644 index b0ccb669376..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace cugraph { -namespace detail { -// Dependecy Accumulation: based on McLaughlin and Bader, 2018 -// FIXME: Accumulation kernel mights not scale well, as each thread is handling -// all the edges for each node, an approach similar to the traversal -// bucket (i.e. BFS / SSSP) system might enable speed up. 
-// Should look into forAllEdge type primitive for different -// load balancing -template -__global__ void edges_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - double coefficient = vertex_sigma * factor; - - vertex_delta += coefficient; - betweenness[edge_idx] += coefficient; - } - } - deltas[vertex] = vertex_delta; - } - } -} - -template -__global__ void endpoints_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - betweenness[vertex] += 1; - deltas[vertex] = vertex_delta; - } - } -} -template -__global__ void accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - deltas[vertex] = vertex_delta; - } - } -} -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cu b/cpp/src/traversal/legacy/bfs.cu deleted file mode 100644 index a0fb11a98d9..00000000000 --- a/cpp/src/traversal/legacy/bfs.cu +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -#include "bfs.cuh" -#include -#include -#include - -#include - -#include "bfs_kernels.cuh" -#include "traversal_common.cuh" -#include -#include - -namespace cugraph { -namespace detail { -enum BFS_ALGO_STATE { TOPDOWN, BOTTOMUP }; - -template -void BFS::setup() -{ - // Determinism flag, false by default - deterministic = false; - - // Working data - // Each vertex can be in the frontier at most once - // We will update frontier during the execution - // We need the orig to reset frontier, or ALLOC_FREE_TRY - original_frontier.resize(number_of_vertices); - frontier = original_frontier.data().get(); - - // size of bitmaps for vertices - vertices_bmap_size = (number_of_vertices / (8 * sizeof(int)) + 1); - // ith bit of visited_bmap is set <=> ith vertex is visited - - visited_bmap.resize(vertices_bmap_size); - - // ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - isolated_bmap.resize(vertices_bmap_size); - - // vertices_degree[i] = degree of vertex i - vertex_degree.resize(number_of_vertices); - - // We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it - // since those uses are mutually exclusive - buffer_np1_1.resize(number_of_vertices + 1); - buffer_np1_2.resize(number_of_vertices + 1); - - // Using buffers : top down - - // frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1.data().get(); - // exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2.data().get(); - - // Using buffers : bottom up - // contains list of unvisited vertices - unvisited_queue = buffer_np1_1.data().get(); - // size of the "last" unvisited queue : size_last_unvisited_queue - // refers to the size of unvisited_queue - // which may not be up to date (the queue may contains vertices that are now - // visited) - - // We may leave vertices unvisited after bottom up main kernels - storing them - // here - left_unvisited_queue = buffer_np1_2.data().get(); - - // We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
- // frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of - // the first edge of the bucket See top down kernels for more details - exclusive_sum_frontier_vertex_buckets_offsets.resize( - ((number_of_edges / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2)); - - // Init device-side counters - // Those counters must be/can be reset at each bfs iteration - // Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the - // current bottleneck - d_counters_pad.resize(4); - - d_new_frontier_cnt = d_counters_pad.data().get(); - d_mu = d_counters_pad.data().get() + 1; - d_unvisited_cnt = d_counters_pad.data().get() + 2; - d_left_unvisited_cnt = d_counters_pad.data().get() + 3; - - // Lets use this int* for the next 3 lines - // Its dereferenced value is not initialized - so we dont care about what we - // put in it - IndexType* d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - - // Computing isolated_bmap - // Only dependent on graph - not source vertex - done once - traversal::flag_isolated_vertices(number_of_vertices, - isolated_bmap.data().get(), - row_offsets, - vertex_degree.data().get(), - d_nisolated, - stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We need nisolated to be ready to use - cudaStreamSynchronize(stream); -} - -template -void BFS::configure(IndexType* _distances, - IndexType* _predecessors, - double* _sp_counters, - int* _edge_mask) -{ - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - sp_counters = _sp_counters; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - // We need distances to use bottom up - if (directed && !computeDistances) { - distances_vals.resize(number_of_vertices); - distances = distances_vals.data().get(); - } - - // In case the shortest path counters is required, previous_bmap has to be allocated - if (sp_counters) { previous_visited_bmap.resize(vertices_bmap_size); } -} - -template -void BFS::traverse(IndexType source_vertex) -{ - // Init visited_bmap - // If the graph is undirected, we not that - // we will never discover isolated vertices (in degree = out degree = 0) - // we avoid a lot of work by flagging them now - // in g500 graphs they represent ~25% of total vertices - // more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap.data().get(), 0, vertices_bmap_size * sizeof(int), stream); - } else { - cudaMemcpyAsync(visited_bmap.data().get(), - isolated_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - - // If needed, setting all vertices as undiscovered (inf distance) - // We dont use computeDistances here - // if the graph is undirected, we may need distances even if - // computeDistances is false - if (distances) - traversal::fill_vec(distances, number_of_vertices, traversal::vec_t::max, stream); - - // If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) { - cudaMemsetAsync(predecessors, -1, number_of_vertices * sizeof(IndexType), stream); - } - - if (sp_counters) { - cudaMemsetAsync(sp_counters, 0, number_of_vertices * sizeof(double), stream); - double value = 1; - cudaMemcpyAsync(sp_counters + source_vertex, &value, sizeof(double), cudaMemcpyHostToDevice); - } - - // - // Initial frontier - // - - frontier = 
original_frontier.data().get(); - - if (distances) { cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); } - - // Setting source_vertex as visited - // There may be bit already set on that bmap (isolated vertices) - if the - // graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - visited_bmap.data().get() + (source_vertex / INT_SIZE), - sizeof(int), - cudaMemcpyDeviceToHost); - // We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - } - - int m = (1 << (source_vertex % INT_SIZE)); - - // In that case, source is isolated, done now - if (!directed && (m & current_visited_bmap_source_vert)) { - // Init distances and predecessors are done, (cf Streamsync in previous if) - return; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync(visited_bmap.data().get() + (source_vertex / INT_SIZE), - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - - // Adding source_vertex to init frontier - cudaMemcpyAsync(&frontier[0], &source_vertex, sizeof(IndexType), cudaMemcpyHostToDevice, stream); - - // mf : edges in frontier - // nf : vertices in frontier - // mu : edges undiscovered - // nu : nodes undiscovered - // lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - // Frontier has one vertex - nf = 1; - - // all edges are undiscovered (by def isolated vertices have 0 edges) - mu = number_of_edges; - - // all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - // That number is wrong if source_vertex is also isolated - but it's not important - nu = number_of_vertices - nisolated - nf; - - // Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = number_of_vertices; // we just need value > 0 - IndexType size_last_unvisited_queue = 0; // queue empty - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - - // At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - // useDistances : we check if a vertex is a parent using distances in bottom up - distances become - // working data undirected g : need parents to be in children's neighbors - - // In case the shortest path counters need to be computeed, the bottom_up approach cannot be used - // bool can_use_bottom_up = (!sp_counters && !directed && distances); - bool can_use_bottom_up = false; - - while (nf > 0) { - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - // Choosing algo - // Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < number_of_vertices / beta) { - // We need to prepare the switch back to top down - // We couldnt keep track of mu during bottom up - because we dont know what mf is. 
- // Computing mu here - bfs_kernels::count_unvisited_edges(unvisited_queue, - size_last_unvisited_queue, - visited_bmap.data().get(), - vertex_degree.data().get(), - d_mu, - stream); - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We will need mf and mu - cudaStreamSynchronize(stream); - algo_state = TOPDOWN; - } - break; - } - } - - // Executing algo - - switch (algo_state) { - case TOPDOWN: - // This step is only required if sp_counters is not nullptr - if (sp_counters) { - cudaMemcpyAsync(previous_visited_bmap.data().get(), - visited_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - // We need to copy the visited_bmap before doing the traversal - cudaStreamSynchronize(stream); - } - traversal::compute_bucket_offsets( - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - nf, - mf, - stream); - bfs_kernels::frontier_expand(row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - previous_visited_bmap.data().get(), - visited_bmap.data().get(), - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap.data().get(), - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We need nf - cudaStreamSynchronize(stream); - - if (nf) { - // Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, new_frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - } - break; - - case BOTTOMUP: - bfs_kernels::fill_unvisited_queue(visited_bmap.data().get(), - vertices_bmap_size, - number_of_vertices, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bfs_kernels::bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - // The number of vertices left unvisited decreases - // If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync(&size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - RAFT_CHECK_CUDA(stream); - // We need last_left_unvisited_size - cudaStreamSynchronize(stream); - bfs_kernels::bottom_up_large(left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We will need nf - cudaStreamSynchronize(stream); - break; - } - - // Updating undiscovered edges count - nu -= nf; - - // Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } -} - -template -void BFS::resetDevicePointers() -{ - cudaMemsetAsync(d_counters_pad.data().get(), 0, 4 * sizeof(IndexType), stream); -} - -template -void BFS::clean() -{ - // the vectors have a destructor that takes care of cleaning -} - -// Explicit Instantiation -template class BFS; -template class BFS; -template class BFS; - -} // namespace detail - -// NOTE: SP counter increase extremely fast on large graph -// It can easily reach 1e40~1e70 on GAP-road.mtx -template -void bfs(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - VT* distances, - VT* predecessors, - double* sp_counters, - const VT start_vertex, - bool directed, - bool mg_batch) -{ - static_assert(std::is_integral::value && sizeof(VT) >= sizeof(int32_t), - "Unsupported vertex id data type. Use integral types of size >= sizeof(int32_t)"); - static_assert(std::is_same::value, - "VT and ET should be the same time for the current BFS implementation"); - static_assert(std::is_floating_point::value, - "Unsupported edge weight type. 
Use floating point types"); // actually, this is - // unnecessary for BFS - if (handle.comms_initialized() && !mg_batch) { - CUGRAPH_FAIL("NO LONGER SUPPORTED"); - } else { - VT number_of_vertices = graph.number_of_vertices; - ET number_of_edges = graph.number_of_edges; - - const VT* indices_ptr = graph.indices; - const ET* offsets_ptr = graph.offsets; - - int alpha = 15; - int beta = 18; - // FIXME: Use VT and ET in the BFS detail - cugraph::detail::BFS bfs( - number_of_vertices, number_of_edges, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances, predecessors, sp_counters, nullptr); - bfs.traverse(start_vertex); - } -} - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cuh b/cpp/src/traversal/legacy/bfs.cuh deleted file mode 100644 index dd636a2c97c..00000000000 --- a/cpp/src/traversal/legacy/bfs.cuh +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -#pragma once - -#include -#include - -#define TRAVERSAL_DEFAULT_ALPHA 15 - -#define TRAVERSAL_DEFAULT_BETA 18 - -namespace cugraph { -namespace detail { -// FIXME: Differentiate IndexType for vertices and edges -template -class BFS { - private: - IndexType number_of_vertices, number_of_edges; - const IndexType* row_offsets = nullptr; - const IndexType* col_indices = nullptr; - - bool directed; - bool deterministic; - - // edgemask, distances, predecessors are set/read by users - using Vectors - bool useEdgeMask; - bool computeDistances; - bool computePredecessors; - rmm::device_vector distances_vals; - IndexType* distances = nullptr; - IndexType* predecessors = nullptr; - double* sp_counters = nullptr; - int* edge_mask = nullptr; - - rmm::device_vector original_frontier; - rmm::device_vector visited_bmap; - rmm::device_vector isolated_bmap; - rmm::device_vector previous_visited_bmap; - rmm::device_vector vertex_degree; - rmm::device_vector buffer_np1_1; - rmm::device_vector buffer_np1_2; - rmm::device_vector exclusive_sum_frontier_vertex_buckets_offsets; - rmm::device_vector d_counters_pad; - // Working data - // For complete description of each, go to bfs.cu - IndexType nisolated; - IndexType* frontier = nullptr; - IndexType* new_frontier = nullptr; - IndexType* frontier_vertex_degree = nullptr; - IndexType* exclusive_sum_frontier_vertex_degree = nullptr; - IndexType* unvisited_queue = nullptr; - IndexType* left_unvisited_queue = nullptr; - IndexType* d_new_frontier_cnt = nullptr; - IndexType* d_mu = nullptr; - IndexType* d_unvisited_cnt = nullptr; - IndexType* d_left_unvisited_cnt = nullptr; - - IndexType vertices_bmap_size; - - // Parameters for direction optimizing - IndexType alpha, beta; - cudaStream_t stream; - - // resets pointers defined by d_counters_pad (see implem) - void resetDevicePointers(); - void setup(); - void clean(); - - public: - virtual ~BFS(void) { clean(); } - - BFS(IndexType _number_of_vertices, - IndexType _number_of_edges, - const IndexType* _row_offsets, - const IndexType* _col_indices, - bool _directed, - IndexType _alpha, - IndexType _beta, - cudaStream_t _stream = 0) - : number_of_vertices(_number_of_vertices), - number_of_edges(_number_of_edges), - row_offsets(_row_offsets), - col_indices(_col_indices), - directed(_directed), - alpha(_alpha), - beta(_beta), - stream(_stream) - { - setup(); - } - - void configure(IndexType* distances, - IndexType* predecessors, - double* sp_counters, - int* edge_mask); - - void traverse(IndexType source_vertex); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs_kernels.cuh b/cpp/src/traversal/legacy/bfs_kernels.cuh deleted file mode 100644 index a0c49e9601a..00000000000 --- a/cpp/src/traversal/legacy/bfs_kernels.cuh +++ /dev/null @@ -1,1163 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include -#include - -#include "traversal_common.cuh" -#include - -namespace cugraph { -namespace detail { -namespace bfs_kernels { -// -// ------------------------- Bottom up ------------------------- -// - -// -// fill_unvisited_queue_kernel -// -// Finding unvisited vertices in the visited_bmap, and putting them in the queue -// Vertices represented by the same int in the bitmap are adjacent in the queue, -// and sorted For instance, the queue can look like this : 34 38 45 58 61 4 18 -// 24 29 71 84 85 90 Because they are represented by those ints in the bitmap : -// [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - -// visited_bmap_nints = the visited_bmap is made of that number of ints - -template -__global__ void fill_unvisited_queue_kernel(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt) -{ - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // When filling the "unvisited" queue, we use "unvisited_cnt" to know where to - // write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - // We will actually do only one atomicAdd per block - we first do a scan, then - // call one atomicAdd, and store the common offset for the block in - // unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - // We don't want threads divergence in the loop (we're going to call - // __syncthreads) Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - // Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : (~0); // will be neutral in the next lines - // (virtual vertices all visited) - - // The last int can only be partially valid - // If we are indeed taking care of the last visited int in this thread, - // We need to first disable (ie set as "visited") the inactive bits - // (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = traversal::getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; // Setting inactive bits as visited - } - - // Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - // We will need to write n_unvisited_in_int unvisited vertices to the - // unvisited queue We ask for that space when computing the block scan, that - // will tell where to write those vertices in the queue, using the common - // offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - // Last thread knows how many vertices will be written to the queue by this - // block Asking for that space in the queue using the global count, and - // saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = traversal::atomicAdd(unvisited_cnt, total); - } - - // syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename traversal::vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec4* unvisited_i4 = - reinterpret_cast::vec4*>( - &unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename traversal::vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec2* unvisited_i2 = - reinterpret_cast::vec2*>( - &unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - } - } -} - -// Wrapper -template -void fill_unvisited_queue(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, 
block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(visited_bmap_nints) + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( - visited_bmap, visited_bmap_nints, n, unvisited, unvisited_cnt); - RAFT_CHECK_CUDA(m_stream); -} - -// -// count_unvisited_edges_kernel -// Couting the total number of unvisited edges in the graph - using an -// potentially unvisited queue We need the current unvisited vertices to be in -// the unvisited queue But visited vertices can be in the potentially_unvisited -// queue We first check if the vertex is still unvisited before using it Useful -// when switching from "Bottom up" to "Top down" -// - -template -__global__ void count_unvisited_edges_kernel(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* degree_vertices, - IndexType* mu) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - // number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) thread_unvisited_edges_count += degree_vertices[u]; - } - - // We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - // block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) traversal::atomicAdd(mu, block_unvisited_edges_count); -} - -// Wrapper -template -void count_unvisited_edges(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* node_degree, - IndexType* mu, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(potentially_unvisited_size) + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( - potentially_unvisited, potentially_unvisited_size, visited_bmap, node_degree, mu); - RAFT_CHECK_CUDA(m_stream); -} - -// -// Main Bottom Up kernel -// Here we will start to process unvisited vertices in the unvisited queue -// We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges -// If it's not possible to define a valid parent using only those edges, -// add it to the "left_unvisited_queue" -// - -// -// We will use the "vertices represented by the same int in the visited bmap are -// adjacents and sorted in the unvisited queue" property It is used to do a -// reduction locally and fully build the new visited_bmap -// - -template -__global__ void main_bottomup_kernel(const IndexType* unvisited, - const IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* left_unvisited_cnt, - int* visited_bmap, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - 
__shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // To write vertices in the frontier, - // We will use a block scan to locally compute the offsets - // frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the - // visited_bmap ints from the vertices represented by the same int (for - // instance vertices 1, 5, 9, 13, 23) vertices represented by the same int - // will be designed as part of the same "group" To detect the deliminations - // between those groups, we use BlockDiscontinuity Then we need to create the - // new "visited_bmap" within those group. We use a warp reduction that takes - // into account limits between groups to do it But a group can be cut in two - // different warps : in that case, the second warp put the result of its local - // reduction in local_visited_bmap_warp_head the first warp will then read it - // and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented - // by different in in visited_bmap) - IndexType visited_bmap_index[1]; // this is an array of size 1 because CUB - // needs one - - visited_bmap_index[0] = invalid_idx; - IndexType unvisited_vertex = invalid_idx; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) { - // Processing first STPV edges of unvisited v - // If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(static_cast(edge_end), - static_cast(edge_begin) + MAIN_BOTTOMUP_MAX_EDGES); - ++edge) { - if (edge_mask && !edge_mask[edge]) continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) { - 
found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); // let this one unvisited - else { - if (distances) distances[unvisited_vertex] = lvl; - if (predecessors) predecessors[unvisited_vertex] = valid_parent; - } - - // If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) { - left_unvisited_off = traversal::atomicAdd(left_unvisited_cnt, static_cast(1)); - more_to_visit = 1; - } - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in - // visited_bmap ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf - // fill_unvisited_queue) 2) A group is of size <= 32, so a warp will contain - // at least one head, and a group will be contained at most by two warps - - int is_head_a[1]; // CUB need an array - BlockDiscontinuity(discontinuity_temp_storage) - .FlagHeads(is_head_a, visited_bmap_index, cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the - // groups are We use bitwise and as operator, because of the fact that 1 is - // the default value If a vertex is unvisited, we have to explicitly ask for - // it - int local_bmap_agg = - WarpReduce(reduce_temp_storage) - .HeadSegmentedReduce(local_visited_bmap, is_head, traversal::BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part - // bellow Corner case : if the first thread of the warp is a head, then this - // group is not cut in two and then we have to be neutral (for an bitwise - // and, it's an ~0) - if (laneid == 0) { local_visited_bmap_warp_head[warpid] = (is_head) ? 
(~0) : local_bmap_agg; } - - // broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = __ballot_sync(raft::warp_full_mask(), is_head); - - // As long as idx < unvisited_size, we know there's at least one head per - // warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp - // delimitation) we also have to make sure that a warp actually exists after - // this one (this corner case is handled after) - if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - // Three cases : - // -> This is the first group of the block - it may be cut in two (with - // previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) { - // The first elt of this group considered in this block is - // unvisited_vertex We know that's the case because elts are sorted in a - // group, and we are at laneid == 0 We will do an atomicOr - we have to be - // neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = traversal::getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; // we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones - // to go in else case - idx < unvisited_size // we could be out - ) { - // Last head of the block - // We don't know if this group is complete - - // last_v is the last unvisited_vertex of the group IN THIS block - // we dont know about the rest - we have to be neutral about elts > last_v - - // the destination thread of the __shfl is active - int laneid_max = - min(static_cast(WARP_SIZE - 1), (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = __shfl_sync(__activemask(), unvisited_vertex, laneid_max, WARP_SIZE); - - if (is_last_head_in_warp) { - int ilast_v = last_v % INT_SIZE + 1; - int mask = traversal::getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; // we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } else { - // group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; // no atomics needed, we know - // everything about this int - } - } - - // Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) { - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // 1) Broadcasting frontier_common_block_offset - // 2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) left_unvisited[left_unvisited_off] = unvisited_vertex; - } -} - -template -void bottom_up_main(IndexType* unvisited, - 
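// Standalone CPU sketch of the work one bottom-up pass performs for each
// still-unvisited vertex: scan a bounded number of neighbours and adopt any
// neighbour sitting at distance lvl-1 as the parent. Vertices with more
// neighbours left to check are deferred to a second pass, mirroring the
// left_unvisited queue used by the kernels in this file. All names and types
// here are illustrative; this is not part of the deleted implementation.
#include <algorithm>
#include <cstdint>
#include <vector>

constexpr int32_t kMaxEdgesFirstPass = 6;  // same role as MAIN_BOTTOMUP_MAX_EDGES

inline void bottom_up_step_serial(const std::vector<int32_t>& row_ptr,
                                  const std::vector<int32_t>& col_ind,
                                  std::vector<int32_t>& distances,     // -1 == unvisited
                                  std::vector<int32_t>& predecessors,
                                  std::vector<int32_t>& deferred,      // vertices needing the 2nd pass
                                  int32_t lvl)
{
  const int32_t n = static_cast<int32_t>(row_ptr.size()) - 1;
  for (int32_t v = 0; v < n; ++v) {
    if (distances[v] != -1) continue;  // already visited
    const int32_t begin = row_ptr[v];
    const int32_t end   = row_ptr[v + 1];
    const int32_t stop  = std::min(end, begin + kMaxEdgesFirstPass);
    bool found          = false;
    for (int32_t e = begin; e < stop; ++e) {
      const int32_t u = col_ind[e];
      if (distances[u] == lvl - 1) {  // u is in the previous frontier: valid parent
        distances[v]    = lvl;
        predecessors[v] = u;
        found           = true;
        break;
      }
    }
    if (!found && end - begin > kMaxEdgesFirstPass) deferred.push_back(v);
  }
}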
IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* d_left_unvisited_idx, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(unvisited_size) + block.x) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// bottom_up_large_degree_kernel -// finishing the work started in main_bottomup_kernel for vertex with degree > -// MAIN_BOTTOMUP_MAX_EDGES && no parent found -// -template -__global__ void bottom_up_large_degree_kernel(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - // Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - // Used only with symmetric graphs - // Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; // we already have checked the - // first MAIN_BOTTOMUP_MAX_EDGES - // edges in find_unvisited - - IndexType end_i_edge = row_ptr[v + 1]; - - // We can have warp divergence in the next loop - // It's not a pb because the behaviour of __ballot - // is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - IndexType valid_parent = invalid_idx; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { valid_parent = u; } - } - - unsigned int warp_valid_p_ballot = - __ballot_sync(raft::warp_full_mask(), valid_parent != invalid_idx); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = - warp_valid_p_ballot >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - // Using only one valid parent (reduce bw) - IndexType off = traversal::atomicAdd(new_frontier_cnt, static_cast(1)); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] 
= lvl; - - if (predecessors) predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { break; } - } - } -} - -template -void bottom_up_large(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = std::min( - static_cast(MAXBLOCKS), - ((static_cast(left_unvisited_size) + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / - block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// topdown_expand_kernel -// Read current frontier and compute new one with top down paradigm -// One thread = One edge -// To know origin of edge, we have to find where is index_edge in the values of -// frontier_degrees_exclusive_sum (using a binary search, max less or equal -// than) This index k will give us the origin of this edge, which is frontier[k] -// This thread will then process the (linear_idx_thread - -// frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] -// -// To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load -// NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches -// We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we -// prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD -// * blockDim.x edges -// -// Once we have those offsets, we may still need a few values from -// frontier_degrees_exclusive_sum to compute exact index k To be able to do it, -// we will load the values that we need from frontier_degrees_exclusive_sum in -// shared memory We know that it will fit because we never add node with degree -// == 0 in the frontier, so we have an upper bound on the number of value to -// load (see below) -// -// We will then look which vertices are not visited yet : -// 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as -// visited, update distances and predecessors, and move on 2) if the unvisited -// vertex has degree > 0, we add it to the "frontier_candidates" queue -// -// We then treat the candidates queue using the threadIdx.x < ncandidates -// If we are indeed the first thread to discover that vertex (result of -// atomicOr(visited)) We add it to the new frontier -// - -template -__global__ void topdown_expand_kernel( - const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_bmap, - int* bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed) -{ - // BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType 
frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able - // to store everything We also save the predecessors here, because we will not - // be able to retrieve it after - // - __shared__ IndexType - shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType - shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - IndexType n_items_per_thread_left = - (totaldegree > block_offset) - ? (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) / TOP_DOWN_EXPAND_DIMX - : 0; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; (n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= min( - n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD))) { - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = - min(n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD)); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + - threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= - // linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE - // + 1] - // - // To find the exact value in that range, we need a few values from - // frontier_degrees_exclusive_sum (see below) We will load them here We will - // load as much as we can - if it doesn't fit we will make multiple - // iteration of the next loop Because all vertices in frontier have degree > - // 0, we know it will fits if left + 1 = right (see below) - - // We're going to load values in frontier_degrees_exclusive_sum for batch - // [left; right[ If it doesn't fit, --right until it does, then loop It is - // excepted to fit on the first try, that's why we start right = - // nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of - // buckets_offsets We need the next val for the binary search, hence the - // +1 - // - - IndexType 
nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - // If left = right + 1 we are sure to have nvalues_to_load < - // TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK]; - - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + - TOP_DOWN_EXPAND_DIMX]; - } - - // shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, - // sync - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; item_index < nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction - // parallism) Reduces latency - - IndexType current_max_edge_index = min( - static_cast(block_offset) + (left + nitems_per_thread_for_this_load) * blockDim.x, - static_cast(totaldegree)); - - // We will need vec_u (source of the edge) until the end if we need to - // save the predecessors For others informations, we will reuse pointers - // on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType* vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; - -#pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = - shared_buckets_offsets[start_off_idx] - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = traversal::binsearch_maxle( - shared_frontier_degrees_exclusive_sum, gid, bucket_start, bucket_end) + - frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = invalid_idx; - vec_frontier_degrees_exclusive_sum_index[iv] = invalid_idx; - } - } - - IndexType* vec_row_ptr_u = &local_buf1[0]; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - // row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != invalid_idx) ? 
row_ptr[u] : invalid_idx; - } - - // We won't need row_ptr after that, reusing pointer - IndexType* vec_dest_v = vec_row_ptr_u; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - - IndexType row_ptr_u = vec_row_ptr_u[iv]; - // Need this check so that we don't use invalid values of edge to index - if (row_ptr_u != invalid_idx) { - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) { - // Disabling edge - row_ptr_u = invalid_idx; - } else { - // Destination of this edge - vec_dest_v[iv] = col_ind[edge]; - } - } - } - - // We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType* vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; - - // Visited bmap need to contain information about the previous - // frontier if we actually process every edge (shortest path counting) - // otherwise we can read and update from the same bmap -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = - (v != invalid_idx) ? previous_bmap[v / INT_SIZE] : (~int(0)); // will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the - // new_frontier Then set vec_candidate[iv] = -1 - IndexType* vec_frontier_candidate = vec_dest_v; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - - int is_visited = vec_v_visited_bmap[iv] & m; - - if (is_visited) vec_frontier_candidate[iv] = invalid_idx; - } - - // Each source should update the destination shortest path counter - // if the destination has not been visited in the *previous* frontier - if (sp_counters) { -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType dst = vec_frontier_candidate[iv]; - if (dst != invalid_idx) { - IndexType src = vec_u[iv]; - atomicAdd(&sp_counters[dst], sp_counters[src]); - } - } - } - - if (directed) { - // vec_v_visited_bmap is available - IndexType* vec_is_isolated_bmap = vec_v_visited_bmap; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != invalid_idx) ? isolated_bmap[v / INT_SIZE] : ~int(0); - } - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; - - // If v is isolated, we will not add it to the frontier (it's not a - // frontier candidate) 1st reason : it's useless 2nd reason : it - // will make top down algo fail we need each node in frontier to - // have a degree > 0 If it is isolated, we just need to mark it as - // visited, and save distance and predecessor here. 
Not need to - // check return value of atomicOr - - if (is_isolated && v != invalid_idx) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) distances[v] = lvl; - - if (predecessors) predecessors[v] = vec_u[iv]; - - // This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = invalid_idx; - } - } - } - - // Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != invalid_idx) ++thread_n_frontier_candidates; - } - - // We need to have all nfrontier_candidates to be ready before doing the - // scan - __syncthreads(); - - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; // offset inside block - BlockScan(scan_storage) - .ExclusiveSum(thread_n_frontier_candidates, thread_frontier_candidate_offset); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - // May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != invalid_idx) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - // No need to add nsuccessor_candidate, even if its an - // exclusive sum - // We incremented the thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - // broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - // We won't need vec_frontier_candidate after that - IndexType* vec_frontier_accepted_vertex = vec_frontier_candidate; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = invalid_idx; - - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; // popping - // queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); // atomicOr returns old - - if (!(m & q)) { // if this thread was the first to discover this node - if (distances) distances[v] = lvl; - - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } - - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } - } - - // We need naccepted_vertices to be ready - __syncthreads(); - - IndexType thread_new_frontier_offset; - - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - // for this thread, thread_new_frontier_offset + has_successor - // (exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // Broadcasting frontier_common_block_offset - __syncthreads(); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != invalid_idx) { - IndexType off = 
frontier_common_block_offset + thread_new_frontier_offset++; - new_frontier[off] = new_frontier_vertex; - } - } - } - } - - // We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - // Preparing for next load - left = right; - right = nitems_per_thread; - } - - // we need to keep shared_buckets_offsets coherent - __syncthreads(); - } -} - -template -void frontier_expand(const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_visited_bmap, - int* visited_bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) -{ - if (!totaldegree) return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = - (static_cast(totaldegree) + MAXBLOCKS * block.x - 1) / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = std::min((static_cast(totaldegree) + max_items_per_thread * block.x - 1) / - (max_items_per_thread * block.x), - static_cast(MAXBLOCKS)); - - // Shortest Path counting (Betweenness Centrality) - // We need to keep track of the previously visited bmap - - // If the coutner of shortest path is nullptr - // The previous_visited_bmap is no longer needed (and should be nullptr on - // the first access), so it can be the same as the current visitedbmap - if (!sp_counters) { previous_visited_bmap = visited_bmap; } - topdown_expand_kernel<<>>( - row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - previous_visited_bmap, - visited_bmap, - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap, - directed); - RAFT_CHECK_CUDA(m_stream); -} - -} // namespace bfs_kernels -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/traversal_common.cuh b/cpp/src/traversal/legacy/traversal_common.cuh deleted file mode 100644 index fac80e90eb6..00000000000 --- a/cpp/src/traversal/legacy/traversal_common.cuh +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include - -#define MAXBLOCKS 65535 -#define WARP_SIZE 32 -#define INT_SIZE 32 - -// -// Bottom up macros -// - -#define FILL_UNVISITED_QUEUE_DIMX 256 - -#define COUNT_UNVISITED_EDGES_DIMX 256 - -#define MAIN_BOTTOMUP_DIMX 256 -#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX / WARP_SIZE) - -#define LARGE_BOTTOMUP_DIMX 256 - -// Number of edges processed in the main bottom up kernel -#define MAIN_BOTTOMUP_MAX_EDGES 6 - -// Power of 2 < 32 (strict <) -#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 - -// -// Top down macros -// - -// We will precompute the results the binsearch_maxle every -// TOP_DOWN_BUCKET_SIZE edges -#define TOP_DOWN_BUCKET_SIZE 32 - -// DimX of the kernel -#define TOP_DOWN_EXPAND_DIMX 256 - -// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets -#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX / TOP_DOWN_BUCKET_SIZE) - -// How many items_per_thread we can process with one bucket_offset loading -// the -1 is here because we need the +1 offset -#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) - -// instruction parallelism -// for how many edges will we create instruction parallelism -#define TOP_DOWN_BATCH_SIZE 2 - -#define COMPUTE_BUCKET_OFFSETS_DIMX 512 - -// Other macros - -#define FLAG_ISOLATED_VERTICES_DIMX 128 - -// Number of vertices handled by one thread -// Must be power of 2, lower than 32 -#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 - -// Number of threads involved in the "construction" of one int in the bitset -#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT \ - (INT_SIZE / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) - -// -// Parameters of the heuristic to switch between bottomup/topdown -// Finite machine described in -// http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf -// - -namespace cugraph { -namespace detail { -namespace traversal { - -// -// gives the equivalent vectors from a type -// for the max val, would be better to use numeric_limits<>::max() once -// cpp11 is allowed in nvgraph -// - -template -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; -}; - -template <> -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef long4 vec4; - typedef long2 vec2; - static const long max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef uint4 vec4; - typedef uint2 vec2; - static const unsigned max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef float4 vec4; - typedef float2 vec2; - static constexpr float max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef double4 vec4; - typedef double2 vec2; - static constexpr double max = std::numeric_limits::max(); -}; - -// -// ------------------------- Helper device functions ------------------- -// - -__forceinline__ __device__ int getMaskNRightmostBitSet(int n) -{ - if (n == INT_SIZE) return (~0); - int mask = (1 << n) - 1; - return mask; -} - -__forceinline__ __device__ int getMaskNLeftmostBitSet(int n) -{ - if (n == 0) return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; -} - -__forceinline__ __device__ int getNextZeroBit(int& val) -{ - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; -} - -struct BitwiseAnd { - template - __host__ __device__ __forceinline__ T 
operator()(const T& a, const T& b) const - { - return (a & b); - } -}; -struct BitwiseOr { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const - { - return (a | b); - } -}; - -template -__global__ void fill_vec_kernel(ValueType* vec, SizeType n, ValueType val) -{ - for (SizeType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) - vec[idx] = val; -} - -template -void fill_vec(ValueType* vec, SizeType n, ValueType val, cudaStream_t stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - RAFT_CHECK_CUDA(stream); -} - -template -__device__ IndexType -binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexType high) -{ - while (true) { - if (low == high) return low; // we know it exists - if ((low + 1) == high) return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - } -} - -// FIXME: The atomicAdd wrappers should be moved to RAFT - -template -__device__ static __forceinline__ T atomicAdd(T* addr, T val) -{ - return ::atomicAdd(addr, val); -} - -template <> -__device__ __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) -{ - static_assert(sizeof(int64_t) == sizeof(unsigned long long), - "sizeof(int64_t) != sizeof(unsigned long long). Can't use atomicAdd"); - - return ::atomicAdd(reinterpret_cast(addr), - static_cast(val)); -} - -__device__ static __forceinline__ float atomicMin(float* addr, float val) -{ - int* addr_as_int = (int*)addr; - int old = *addr_as_int; - int expected; - do { - expected = old; - old = - ::atomicCAS(addr_as_int, expected, __float_as_int(::fminf(val, __int_as_float(expected)))); - } while (expected != old); - return __int_as_float(old); -} - -__device__ static __forceinline__ double atomicMin(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = ::atomicCAS( - address_as_ull, assumed, __double_as_longlong(::fmin(val, __longlong_as_double(assumed)))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != - // NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} - -template -__global__ void flag_isolated_vertices_kernel(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated) -{ - typedef cub::BlockLoad - BlockLoad; - typedef cub::BlockStore - BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage - warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - IndexType thread_off = block_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType 
thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load(row_ptr + block_off, thread_row_ptr, block_valid_items, -1); - - // To compute 4 degrees, we need 5 values of row_ptr - // Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; } - - // If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n > thread_off) ? (n - thread_off) : 0; - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - // We need local_nisolated and local_isolated_bmap to be ready for next - // steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { traversal::atomicAdd(nisolated, total_nisolated); } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - // Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(local_isolated_bmap, BitwiseOr()); - - int is_head_of_visited_int = ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int && (thread_off / INT_SIZE) < (n + INT_SIZE - 1) / INT_SIZE) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items - 1); - } -} - -template -void flag_isolated_vertices(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min((IndexType)MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>( - n, isolated_bmap, row_ptr, degrees, nisolated); - RAFT_CHECK_CUDA(m_stream); -} - -template -__global__ void set_frontier_degree_kernel(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n) -{ - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } -} - -template -void set_frontier_degree(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType)MAXBLOCKS); - 
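// The frontier degrees gathered here, their exclusive sum, and binsearch_maxle()
// above together give the "one thread = one edge" mapping used by
// topdown_expand_kernel: an edge's global index is mapped back to the frontier
// vertex that owns it. A small host-side illustration is sketched below; the
// names are illustrative and only the indexing scheme mirrors this header.
#include <cstdint>
#include <vector>

// Largest k with exclusive_sum[k] <= value (same contract as binsearch_maxle).
inline int64_t max_le(const std::vector<int64_t>& exclusive_sum, int64_t value)
{
  int64_t low = 0, high = static_cast<int64_t>(exclusive_sum.size()) - 1;
  while (low < high) {
    const int64_t mid = low + (high - low + 1) / 2;  // biased up so the loop terminates
    if (exclusive_sum[mid] <= value) low = mid; else high = mid - 1;
  }
  return low;
}

// Walk every edge incident to the frontier and recover (source vertex, local edge).
inline void for_each_frontier_edge(const std::vector<int64_t>& frontier,
                                   const std::vector<int64_t>& degree_exclusive_sum,  // one entry per frontier vertex
                                   const std::vector<int64_t>& row_ptr,
                                   const std::vector<int64_t>& col_ind,
                                   int64_t total_degree)
{
  for (int64_t gid = 0; gid < total_degree; ++gid) {
    const int64_t k    = max_le(degree_exclusive_sum, gid);          // which frontier entry owns this edge
    const int64_t src  = frontier[k];
    const int64_t edge = row_ptr[src] + (gid - degree_exclusive_sum[k]);
    const int64_t dst  = col_ind[edge];
    (void)dst;  // a real traversal would now test/update the visited bitmap for dst
  }
}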
set_frontier_degree_kernel<<>>(frontier_degree, frontier, degree, n); - RAFT_CHECK_CUDA(m_stream); -} - -template -void exclusive_sum(void* d_temp_storage, - size_t temp_storage_bytes, - IndexType* d_in, - IndexType* d_out, - IndexType num_items, - cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, m_stream); -} - -template -void exclusive_sum(IndexType* d_in, IndexType* d_out, IndexType num_items, cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - thrust::exclusive_scan(rmm::exec_policy(m_stream), d_in, d_in + num_items, d_out); -} - -// -// compute_bucket_offsets_kernel -// simply compute the position in the frontier corresponding all valid edges -// with index=TOP_DOWN_BUCKET_SIZE * k, k integer -// - -template -__global__ void compute_bucket_offsets_kernel(const IndexType* frontier_degrees_exclusive_sum, - IndexType* bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) -{ - IndexType end = - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; bid <= end; - bid += gridDim.x * blockDim.x) { - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = - binsearch_maxle(frontier_degrees_exclusive_sum, eid, (IndexType)0, frontier_size - 1); - } -} - -template -void compute_bucket_offsets(IndexType* cumul, - IndexType* bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = - min((IndexType)MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1 + - block.x - 1) / - block.x); - - compute_bucket_offsets_kernel<<>>( - cumul, bucket_offsets, frontier_size, total_degree); - RAFT_CHECK_CUDA(m_stream); -} -} // namespace traversal -} // namespace detail -} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index da1e0e50919..eebd31a0030 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -239,15 +239,6 @@ ConfigureTest(GRAPH_GENERATORS_TEST generators/generators_test.cpp) # - erdos renyi graph generator tests ------------------------------------------------------------- ConfigureTest(ERDOS_RENYI_GENERATOR_TEST generators/erdos_renyi_test.cpp) -################################################################################################### -# - betweenness centrality tests ------------------------------------------------------------------ -ConfigureTest(LEGACY_BETWEENNESS_TEST centrality/legacy/betweenness_centrality_test.cu) -ConfigureTest(LEGACY_EDGE_BETWEENNESS_TEST centrality/legacy/edge_betweenness_centrality_test.cu) - -################################################################################################### -# - BFS tests ------------------------------------------------------------------------------------- -ConfigureTest(LEGACY_BFS_TEST traversal/legacy/bfs_test.cu) - ################################################################################################### # - LOUVAIN tests --------------------------------------------------------------------------------- ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp) diff --git a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu 
b/cpp/tests/centrality/legacy/betweenness_centrality_test.cu deleted file mode 100644 index 2cdbe1c98e4..00000000000 --- a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -void ref_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -template -void ref_endpoints_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - result[source] += S.size() - 1; - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w] + 1; } - } -} - -template -void ref_edge_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - bool endpoints, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector 
sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - bool endpoints, - vertex_t const number_of_vertices, - vertex_t const number_of_sources) -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - result_t casted_number_of_sources = static_cast(number_of_sources); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 2) { - if (endpoints) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - } else { - if (!directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - if (modified) { - if (number_of_sources > 0) { - rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); - } - } - for (auto idx = 0; idx < number_of_vertices; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, // This is not yet implemented - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_betweenness_centrality_impl(&h_indices[0], - &h_offsets[0], - number_of_vertices, - result, - endpoints, - sources, - number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, endpoints, number_of_vertices, number_of_sources); -} -// Explicit instantiation -/* FIXME!!! 
-template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - float *, - bool, - bool, - const int, - int const *); -template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - double *, - bool, - bool, - const int, - int const *); -*/ - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct BC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - BC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BC_Usecase; - -class Tests_BC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BC() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - // endpoints should the endpoints be included - template - void run_current_test(const BC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_vertices, 0); - std::vector expected(G.number_of_vertices, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_betweenness_centrality( - G, 
expected.data(), normalize, endpoints, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_vertices); - cugraph::betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - endpoints, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - cudaDeviceSynchronize(); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_vertices, - cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - for (int i = 0; i < G.number_of_vertices; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4), - BC_Usecase("test/datasets/wiki2003.mtx", 4), - BC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu b/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu deleted file mode 100644 index 153993deda7..00000000000 --- a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ - -template -edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex, - vertex_t destination_vertex, - vertex_t const* indices, - edge_t const* offsets) -{ - edge_t index = -1; - edge_t first_edge_idx = offsets[source_vertex]; - edge_t last_edge_idx = offsets[source_vertex + 1]; - auto index_it = std::find(indices + first_edge_idx, indices + last_edge_idx, destination_vertex); - if (index_it != (indices + last_edge_idx)) { index = std::distance(indices, index_it); } - return index; -} - -template -void ref_accumulation(result_t* result, - vertex_t const* indices, - edge_t const* offsets, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - edge_t edge_idx = - get_edge_index_from_source_and_destination( - v, w, indices, offsets); - double coefficient = (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - - deltas[v] += coefficient; - result[edge_idx] += coefficient; - } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_edge_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. 
Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - vertex_t const number_of_vertices, - edge_t const number_of_edges) -{ - result_t rescale_factor = static_cast(1); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 1) { - rescale_factor /= ((casted_number_of_vertices) * (casted_number_of_vertices - 1)); - } - } else { - if (!directed) { rescale_factor /= static_cast(2); } - } - for (auto idx = 0; idx < number_of_edges; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_edge_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_edge_betweenness_centrality_impl( - &h_indices[0], &h_offsets[0], number_of_vertices, result, sources, number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, number_of_vertices, number_of_edges); -} - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct EdgeBC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - EdgeBC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} EdgeBC_Usecase; - -class Tests_EdgeBC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_EdgeBC() {} - - 
static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // FIXME: Should normalize be part of the configuration instead? - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - template - void run_current_test(const EdgeBC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_edges, 0); - std::vector expected(G.number_of_edges, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_edge_betweenness_centrality( - G, expected.data(), normalize, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_edges); - cugraph::edge_betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_edges, - cudaMemcpyDeviceToHost)); - for (int i = 0; i < G.number_of_edges; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_EdgeBC, CheckFP64_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NORMALIZE) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_EdgeBC, CheckFP64_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - 
::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/legacy/bfs_ref.h b/cpp/tests/traversal/legacy/bfs_ref.h deleted file mode 100644 index 5efdce818e7..00000000000 --- a/cpp/tests/traversal/legacy/bfs_ref.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -template -void populate_neighbors(VT* indices, ET* offsets, VT w, std::vector& neighbors) -{ - ET edge_start = offsets[w]; - ET edge_end = offsets[w + 1]; - - neighbors.assign(indices + edge_start, indices + edge_end); -} - -// This implements the BFS based on (Brandes, 2001) for shortest path counting -template -void ref_bfs(VT* indices, - ET* offsets, - VT const number_of_vertices, - std::queue& Q, - std::stack& S, - std::vector& dist, - std::vector>& pred, - std::vector& sigmas, - VT source) -{ - std::vector neighbors; - pred.clear(); - pred.resize(number_of_vertices); - dist.assign(number_of_vertices, std::numeric_limits::max()); - sigmas.assign(number_of_vertices, 0); - dist[source] = 0; - sigmas[source] = 1; - Q.push(source); - // b. Traversal - while (!Q.empty()) { - VT v = Q.front(); - Q.pop(); - S.push(v); - populate_neighbors(indices, offsets, v, neighbors); - for (VT w : neighbors) { - // Path Discovery: - // Found for the first time? - if (dist[w] == std::numeric_limits::max()) { - dist[w] = dist[v] + 1; - Q.push(w); - } - // Path counting - // Edge(v, w) on a shortest path? - if (dist[w] == dist[v] + 1) { - sigmas[w] += sigmas[v]; - pred[w].push_back(v); - } - } - } -} diff --git a/cpp/tests/traversal/legacy/bfs_test.cu b/cpp/tests/traversal/legacy/bfs_test.cu deleted file mode 100644 index c6d3c96aa93..00000000000 --- a/cpp/tests/traversal/legacy/bfs_test.cu +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. 
- */ - -#include "bfs_ref.h" - -#include -#include - -#include - -#include -#include - -#include - -#include -#include -#include - -// NOTE: This could be common to other files but we might not want the same precision -// depending on the algorithm -#ifndef TEST_EPSILON // It is currently use for relative error -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================ -// Test Suite -// ============================================================================ -typedef struct BFS_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int source_; // Starting point from the traversal - BFS_Usecase_t(const std::string& config, int source) : config_(config), source_(source) - { - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BFS_Usecase; - -class Tests_BFS : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BFS() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // VT vertex identifier data type - // ET edge identifier data type - // WT edge weight data type - // return_sp_counter should BFS return shortest path countner - template - void run_current_test(const BFS_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - VT number_of_vertices; - ET number_of_edges; - bool directed = false; - auto csr = - cugraph::test::generate_graph_csr_from_mm(directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = directed; - - ASSERT_TRUE(configuration.source_ >= 0 && (VT)configuration.source_ < G.number_of_vertices) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph"; - - VT source = configuration.source_; - - number_of_vertices = G.number_of_vertices; - number_of_edges = G.number_of_edges; - - std::vector indices(number_of_edges); - std::vector offsets(number_of_vertices + 1); - - RAFT_CUDA_TRY( - cudaMemcpy(indices.data(), G.indices, sizeof(VT) * indices.size(), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(offsets.data(), G.offsets, sizeof(ET) * offsets.size(), cudaMemcpyDeviceToHost)); - - std::queue Q; - std::stack S; - std::vector ref_bfs_dist(number_of_vertices); - std::vector> ref_bfs_pred(number_of_vertices); - std::vector ref_bfs_sigmas(number_of_vertices); - - ref_bfs(indices.data(), - offsets.data(), - number_of_vertices, - Q, - S, - ref_bfs_dist, - ref_bfs_pred, - ref_bfs_sigmas, - source); - - 
// Device data for cugraph_bfs - rmm::device_vector d_cugraph_dist(number_of_vertices); - rmm::device_vector d_cugraph_pred(number_of_vertices); - rmm::device_vector d_cugraph_sigmas(number_of_vertices); - - std::vector cugraph_dist(number_of_vertices); - std::vector cugraph_pred(number_of_vertices); - std::vector cugraph_sigmas(number_of_vertices); - - // Don't pass valid sp_sp_counter ptr unless needed because it disables - // the bottom up flow - cugraph::bfs(handle, - G, - d_cugraph_dist.data().get(), - d_cugraph_pred.data().get(), - (return_sp_counter) ? d_cugraph_sigmas.data().get() : nullptr, - source, - G.prop.directed); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_dist.data(), - d_cugraph_dist.data().get(), - sizeof(VT) * d_cugraph_dist.size(), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_pred.data(), - d_cugraph_pred.data().get(), - sizeof(VT) * d_cugraph_pred.size(), - cudaMemcpyDeviceToHost)); - - if (return_sp_counter) { - RAFT_CUDA_TRY(cudaMemcpy(cugraph_sigmas.data(), - d_cugraph_sigmas.data().get(), - sizeof(double) * d_cugraph_sigmas.size(), - cudaMemcpyDeviceToHost)); - } - - for (VT i = 0; i < number_of_vertices; ++i) { - // Check distances: should be an exact match as we use signed int 32-bit - EXPECT_EQ(cugraph_dist[i], ref_bfs_dist[i]) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - // Check predecessor: We do not enforce the predecessor, we simply verifiy - // that the predecessor obtained with the GPU implementation is one of the - // predecessors obtained during the C++ BFS traversal - VT pred = cugraph_pred[i]; // It could be equal to -1 if the node is never reached - constexpr VT invalid_vid = cugraph::legacy::invalid_vertex_id::value; - if (pred == invalid_vid) { - EXPECT_TRUE(ref_bfs_pred[i].empty()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph had not predecessor," - << "while c++ ref found at least one."; - } else { - // This can get expensive to check, we could have simply verified that based - // on the the distance from the source to the predecessor, but this ensures that there - // are no misassignations - auto it = std::find(ref_bfs_pred[i].begin(), ref_bfs_pred[i].end(), pred); - EXPECT_TRUE(it != ref_bfs_pred[i].end()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph = " << cugraph_sigmas[i] - << " , c++ ref did not consider it as a predecessor."; - } - - if (return_sp_counter) { - EXPECT_TRUE( - compare_close(cugraph_sigmas[i], ref_bfs_sigmas[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - } - } - } -}; - -// ============================================================================ -// Tests -// ============================================================================ - -// We don't need to test WT for both float and double since it's anyway ignored in BFS -TEST_P(Tests_BFS, CheckUint32_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_NO_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BFS, CheckUint32_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BFS, - ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), - 
BFS_Usecase("test/datasets/polbooks.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 100), - BFS_Usecase("test/datasets/wiki2003.mtx", 1000), - BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); - -CUGRAPH_TEST_PROGRAM_MAIN()